You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2014/11/04 22:07:42 UTC
svn commit: r1636736 - in /nutch: branches/2.x/ branches/2.x/conf/
branches/2.x/src/java/org/apache/nutch/util/
branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/
branches/2.x/src/plugin/urlnormalizer-regex/sample/ branches/...
Author: snagel
Date: Tue Nov 4 21:07:41 2014
New Revision: 1636736
URL: http://svn.apache.org/r1636736
Log:
NUTCH-1483 (including NUTCH-1879, NUTCH-1880, NUTCH-1885) fix errors related to protocol-file
Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/conf/nutch-default.xml
nutch/branches/2.x/conf/regex-normalize.xml.template
nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java
nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
nutch/branches/2.x/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test
nutch/branches/2.x/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.xml
nutch/branches/2.x/src/test/org/apache/nutch/util/TestURLUtil.java
nutch/trunk/CHANGES.txt
nutch/trunk/conf/nutch-default.xml
nutch/trunk/conf/regex-normalize.xml.template
nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java
nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test
nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.xml
nutch/trunk/src/test/org/apache/nutch/util/TestURLUtil.java
Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1636736&r1=1636735&r2=1636736&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Tue Nov 4 21:07:41 2014
@@ -2,6 +2,14 @@ Nutch Change Log
Current Development 2.3-SNAPSHOT
+* NUTCH-1483 Can't crawl filesystem with protocol-file plugin (Rogério Pereira Araújo, Mengying Wang, snagel)
+
+* NUTCH-1885 Protocol-file should treat symbolic links as redirects (Mengying Wang, snagel)
+
+* NUTCH-1880 URLUtil should not add additional slashes for file URLs (snagel)
+
+* NUTCH-1879 Regex URL normalizer should remove multiple slashes after file: protocol (snagel)
+
* NUTCH-1820 remove field "orig" which duplicates "id" (lewismc, snagel)
* NUTCH-1843 Upgrade to Gora 0.5 (talat, lewismc, Kiril Menshikov, drazzib)
Modified: nutch/branches/2.x/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/nutch-default.xml?rev=1636736&r1=1636735&r2=1636736&view=diff
==============================================================================
--- nutch/branches/2.x/conf/nutch-default.xml (original)
+++ nutch/branches/2.x/conf/nutch-default.xml Tue Nov 4 21:07:41 2014
@@ -46,6 +46,19 @@
</property>
<property>
+ <name>file.crawl.redirect_noncanonical</name>
+ <value>true</value>
+ <description>
+ If true, protocol-file treats non-canonical file names as
+ redirects and does not canonicalize file names internally. A file
+ name containing symbolic links as path elements is then not
+ resolved and "fetched" but recorded as a redirect, with the
+ canonical name (all links on the path resolved) as the redirect
+ target.
+ </description>
+</property>
+
+<property>
<name>file.content.ignored</name>
<value>true</value>
<description>If true, no file content will be saved during fetch.
Modified: nutch/branches/2.x/conf/regex-normalize.xml.template
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/regex-normalize.xml.template?rev=1636736&r1=1636735&r2=1636736&view=diff
==============================================================================
--- nutch/branches/2.x/conf/regex-normalize.xml.template (original)
+++ nutch/branches/2.x/conf/regex-normalize.xml.template Tue Nov 4 21:07:41 2014
@@ -63,7 +63,15 @@
<substitution></substitution>
</regex>
-<!-- removes duplicate slashes -->
+<!-- normalize file:/// protocol prefix: -->
+<!-- keep one single slash (NUTCH-1483) -->
+<regex>
+ <pattern>^file://+</pattern>
+ <substitution>file:/</substitution>
+</regex>
+
+<!-- removes duplicate slashes but -->
+<!-- * allow 2 slashes after colon ':' (indicating protocol) -->
<regex>
<pattern>(?<!:)/{2,}</pattern>
<substitution>/</substitution>
Modified: nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java?rev=1636736&r1=1636735&r2=1636736&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java Tue Nov 4 21:07:41 2014
@@ -383,9 +383,15 @@ public class URLUtil {
public static String toASCII(String url) {
try {
URL u = new URL(url);
+ String host = u.getHost();
+ if (host == null || host.isEmpty()) {
+ // no host name => no punycoded domain name
+ // also do not add additional slashes for file: URLs (NUTCH-1880)
+ return url;
+ }
URI p = new URI(u.getProtocol(),
u.getUserInfo(),
- IDN.toASCII(u.getHost()),
+ IDN.toASCII(host),
u.getPort(),
u.getPath(),
u.getQuery(),
@@ -401,6 +407,12 @@ public class URLUtil {
public static String toUNICODE(String url) {
try {
URL u = new URL(url);
+ String host = u.getHost();
+ if (host == null || host.isEmpty()) {
+ // no host name => no punycoded domain name
+ // also do not add additional slashes for file: URLs (NUTCH-1880)
+ return url;
+ }
StringBuilder sb = new StringBuilder();
sb.append(u.getProtocol());
sb.append("://");
@@ -408,7 +420,7 @@ public class URLUtil {
sb.append(u.getUserInfo());
sb.append('@');
}
- sb.append(IDN.toUnicode(u.getHost()));
+ sb.append(IDN.toUnicode(host));
if (u.getPort() != -1) {
sb.append(':');
sb.append(u.getPort());
Modified: nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java?rev=1636736&r1=1636735&r2=1636736&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java (original)
+++ nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java Tue Nov 4 21:07:41 2014
@@ -54,6 +54,12 @@ public class File implements Protocol {
boolean crawlParents;
+ /**
+ * if true return a redirect for symbolic links and do not resolve the links
+ * internally
+ */
+ boolean symlinksAsRedirects = true;
+
private Configuration conf;
// constructor
@@ -66,6 +72,8 @@ public class File implements Protocol {
this.conf = conf;
this.maxContentLength = conf.getInt("file.content.limit", 64 * 1024);
this.crawlParents = conf.getBoolean("file.crawl.parent", true);
+ this.symlinksAsRedirects = conf.getBoolean(
+ "file.crawl.redirect_noncanonical", true);
}
/**
@@ -116,13 +124,20 @@ public class File implements Protocol {
return new ProtocolOutput(response.toContent(), ProtocolStatusUtils.STATUS_NOTFOUND);
} else if (code >= 300 && code < 400) { // handle redirect
- if (redirects == MAX_REDIRECTS)
- throw new FileException("Too many redirects: " + url);
u = new URL(response.getHeader("Location"));
- redirects++;
if (LOG.isTraceEnabled()) {
LOG.trace("redirect to " + u);
}
+ if (symlinksAsRedirects) {
+ return new ProtocolOutput(response.toContent(),
+ ProtocolStatusUtils.makeStatus(ProtocolStatusUtils.MOVED, u));
+ } else if (redirects == MAX_REDIRECTS) {
+ LOG.trace("Too many redirects: {}", url);
+ return new ProtocolOutput(response.toContent(),
+ ProtocolStatusUtils.makeStatus(
+ ProtocolStatusUtils.REDIR_EXCEEDED, u));
+ }
+ redirects++;
} else { // convert to exception
throw new FileError(code);
@@ -174,14 +189,26 @@ public class File implements Protocol {
if (maxContentLength != Integer.MIN_VALUE) // set maxContentLength
file.setMaxContentLength(maxContentLength);
- Content content = file.getProtocolOutput(urlString, WebPage.newBuilder().build())
- .getContent();
-
+ ProtocolOutput output = file.getProtocolOutput(urlString, WebPage
+ .newBuilder().build());
+ Content content = output.getContent();
+
+ System.err.println("URL: " + content.getUrl());
+ ProtocolStatus status = output.getStatus();
+ String protocolMessage = ProtocolStatusUtils.getMessage(status);
+ System.err.println("Status: "
+ + ProtocolStatusUtils.getName(status.getCode())
+ + (protocolMessage == null ? "" : ": " + protocolMessage));
System.out.println("Content-Type: " + content.getContentType());
System.out.println("Content-Length: "
+ content.getMetadata().get(Response.CONTENT_LENGTH));
System.out.println("Last-Modified: "
+ content.getMetadata().get(Response.LAST_MODIFIED));
+ String redirectLocation = content.getMetadata().get("Location");
+ if (redirectLocation != null) {
+ System.err.println("Location: " + redirectLocation);
+ }
+
if (dumpContent) {
System.out.print(new String(content.getContent()));
}
Modified: nutch/branches/2.x/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test?rev=1636736&r1=1636735&r2=1636736&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test (original)
+++ nutch/branches/2.x/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test Tue Nov 4 21:07:41 2014
@@ -75,3 +75,10 @@ http://www.foo.com/foo.html?&x=y http://
http://www.foo.com/foo.html?x=y&&&z=a http://www.foo.com/foo.html?x=y&z=a
http://www.foo.com/foo.html? http://www.foo.com/foo.html
+# remove double slashes but keep 2 slashes after protocol
+http://www.foo.com//path//foo.html http://www.foo.com/path/foo.html
+https://www.foo.com//path//foo.html https://www.foo.com/path/foo.html
+
+# normalize file: protocol prefix (keep one slash)
+file:///path//foo.html file:/path/foo.html
+file:/path//foo.html file:/path/foo.html
Modified: nutch/branches/2.x/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.xml?rev=1636736&r1=1636735&r2=1636736&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.xml (original)
+++ nutch/branches/2.x/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.xml Tue Nov 4 21:07:41 2014
@@ -48,5 +48,19 @@
<substitution></substitution>
</regex>
+<!-- normalize file:/// protocol prefix: -->
+<!-- keep one single slash (NUTCH-1483) -->
+<regex>
+ <pattern>^file://+</pattern>
+ <substitution>file:/</substitution>
+</regex>
+
+<!-- removes duplicate slashes but -->
+<!-- * allow 2 slashes after colon ':' (indicating protocol) -->
+<regex>
+ <pattern>(?<!:)/{2,}</pattern>
+ <substitution>/</substitution>
+</regex>
+
</regex-normalize>
Modified: nutch/branches/2.x/src/test/org/apache/nutch/util/TestURLUtil.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/util/TestURLUtil.java?rev=1636736&r1=1636735&r2=1636736&view=diff
==============================================================================
--- nutch/branches/2.x/src/test/org/apache/nutch/util/TestURLUtil.java (original)
+++ nutch/branches/2.x/src/test/org/apache/nutch/util/TestURLUtil.java Tue Nov 4 21:07:41 2014
@@ -277,4 +277,11 @@ public class TestURLUtil {
URLUtil.toASCII("http://www.medizin.uni-tübingen.de:8080/search.php?q=abc#p1"));
}
+ @Test
+ public void testFileProtocol() throws Exception {
+ // keep one single slash (NUTCH-1880)
+ assertEquals("file:/path/file.html", URLUtil.toASCII("file:/path/file.html"));
+ assertEquals("file:/path/file.html", URLUtil.toUNICODE("file:/path/file.html"));
+ }
+
}
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1636736&r1=1636735&r2=1636736&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Tue Nov 4 21:07:41 2014
@@ -2,6 +2,14 @@ Nutch Change Log
Nutch Current Development 1.10-SNAPSHOT
+* NUTCH-1483 Can't crawl filesystem with protocol-file plugin (Rogério Pereira Araújo, Mengying Wang, snagel)
+
+* NUTCH-1885 Protocol-file should treat symbolic links as redirects (Mengying Wang, snagel)
+
+* NUTCH-1880 URLUtil should not add additional slashes for file URLs (snagel)
+
+* NUTCH-1879 Regex URL normalizer should remove multiple slashes after file: protocol (snagel)
+
* NUTCH-1883 bin/crawl: use function to run bin/nutch and check exit value (snagel)
* NUTCH-1865 Enable use of SNAPSHOT's with Nutch Ivy dependency management (lewismc)
Modified: nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1636736&r1=1636735&r2=1636736&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Tue Nov 4 21:07:41 2014
@@ -56,6 +56,19 @@
</property>
<property>
+ <name>file.crawl.redirect_noncanonical</name>
+ <value>true</value>
+ <description>
+ If true, protocol-file treats non-canonical file names as
+ redirects and does not canonicalize file names internally. A file
+ name containing symbolic links as path elements is then not
+ resolved and "fetched" but recorded as a redirect, with the
+ canonical name (all links on the path resolved) as the redirect
+ target.
+ </description>
+</property>
+
+<property>
<name>file.content.ignored</name>
<value>true</value>
<description>If true, no file content will be saved during fetch.
Modified: nutch/trunk/conf/regex-normalize.xml.template
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/regex-normalize.xml.template?rev=1636736&r1=1636735&r2=1636736&view=diff
==============================================================================
--- nutch/trunk/conf/regex-normalize.xml.template (original)
+++ nutch/trunk/conf/regex-normalize.xml.template Tue Nov 4 21:07:41 2014
@@ -63,7 +63,15 @@
<substitution></substitution>
</regex>
-<!-- removes duplicate slashes -->
+<!-- normalize file:/// protocol prefix: -->
+<!-- keep one single slash (NUTCH-1483) -->
+<regex>
+ <pattern>^file://+</pattern>
+ <substitution>file:/</substitution>
+</regex>
+
+<!-- removes duplicate slashes but -->
+<!-- * allow 2 slashes after colon ':' (indicating protocol) -->
<regex>
<pattern>(?<!:)/{2,}</pattern>
<substitution>/</substitution>
Modified: nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java?rev=1636736&r1=1636735&r2=1636736&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java Tue Nov 4 21:07:41 2014
@@ -425,9 +425,15 @@ public class URLUtil {
public static String toASCII(String url) {
try {
URL u = new URL(url);
+ String host = u.getHost();
+ if (host == null || host.isEmpty()) {
+ // no host name => no punycoded domain name
+ // also do not add additional slashes for file: URLs (NUTCH-1880)
+ return url;
+ }
URI p = new URI(u.getProtocol(),
u.getUserInfo(),
- IDN.toASCII(u.getHost()),
+ IDN.toASCII(host),
u.getPort(),
u.getPath(),
u.getQuery(),
@@ -443,6 +449,12 @@ public class URLUtil {
public static String toUNICODE(String url) {
try {
URL u = new URL(url);
+ String host = u.getHost();
+ if (host == null || host.isEmpty()) {
+ // no host name => no punycoded domain name
+ // also do not add additional slashes for file: URLs (NUTCH-1880)
+ return url;
+ }
StringBuilder sb = new StringBuilder();
sb.append(u.getProtocol());
sb.append("://");
@@ -450,7 +462,7 @@ public class URLUtil {
sb.append(u.getUserInfo());
sb.append('@');
}
- sb.append(IDN.toUnicode(u.getHost()));
+ sb.append(IDN.toUnicode(host));
if (u.getPort() != -1) {
sb.append(':');
sb.append(u.getPort());
Modified: nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java?rev=1636736&r1=1636735&r2=1636736&view=diff
==============================================================================
--- nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java (original)
+++ nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java Tue Nov 4 21:07:41 2014
@@ -53,6 +53,12 @@ public class File implements Protocol {
int maxContentLength;
boolean crawlParents;
+ /**
+ * if true return a redirect for symbolic links and do not resolve the links
+ * internally
+ */
+ boolean symlinksAsRedirects = true;
+
private Configuration conf;
public File() {}
@@ -64,6 +70,8 @@ public class File implements Protocol {
this.conf = conf;
this.maxContentLength = conf.getInt("file.content.limit", 64 * 1024);
this.crawlParents = conf.getBoolean("file.crawl.parent", true);
+ this.symlinksAsRedirects = conf.getBoolean(
+ "file.crawl.redirect_noncanonical", true);
}
/**
@@ -115,13 +123,19 @@ public class File implements Protocol {
return new ProtocolOutput(response.toContent(), ProtocolStatus.STATUS_NOTFOUND);
} else if (code >= 300 && code < 400) { // handle redirect
- if (redirects == MAX_REDIRECTS)
- throw new FileException("Too many redirects: " + url);
u = new URL(response.getHeader("Location"));
- redirects++;
if (LOG.isTraceEnabled()) {
LOG.trace("redirect to " + u);
}
+ if (symlinksAsRedirects) {
+ return new ProtocolOutput(response.toContent(), new ProtocolStatus(
+ ProtocolStatus.MOVED, u));
+ } else if (redirects == MAX_REDIRECTS) {
+ LOG.trace("Too many redirects: {}", url);
+ return new ProtocolOutput(response.toContent(), new ProtocolStatus(
+ ProtocolStatus.REDIR_EXCEEDED, u));
+ }
+ redirects++;
} else { // convert to exception
throw new FileError(code);
@@ -172,13 +186,21 @@ public class File implements Protocol {
// set log level
//LOG.setLevel(Level.parse((new String(logLevel)).toUpperCase()));
- Content content = file.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
+ ProtocolOutput output = file.getProtocolOutput(new Text(urlString), new CrawlDatum());
+ Content content = output.getContent();
+ System.err.println("URL: " + content.getUrl());
+ System.err.println("Status: " + output.getStatus());
System.err.println("Content-Type: " + content.getContentType());
System.err.println("Content-Length: " +
content.getMetadata().get(Response.CONTENT_LENGTH));
System.err.println("Last-Modified: " +
content.getMetadata().get(Response.LAST_MODIFIED));
+ String redirectLocation = content.getMetadata().get("Location");
+ if (redirectLocation != null) {
+ System.err.println("Location: " + redirectLocation);
+ }
+
if (dumpContent) {
System.out.print(new String(content.getContent()));
}
Modified: nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test?rev=1636736&r1=1636735&r2=1636736&view=diff
==============================================================================
--- nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test (original)
+++ nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test Tue Nov 4 21:07:41 2014
@@ -75,3 +75,10 @@ http://www.foo.com/foo.html?&x=y http://
http://www.foo.com/foo.html?x=y&&&z=a http://www.foo.com/foo.html?x=y&z=a
http://www.foo.com/foo.html? http://www.foo.com/foo.html
+# remove double slashes but keep 2 slashes after protocol
+http://www.foo.com//path//foo.html http://www.foo.com/path/foo.html
+https://www.foo.com//path//foo.html https://www.foo.com/path/foo.html
+
+# normalize file: protocol prefix (keep one slash)
+file:///path//foo.html file:/path/foo.html
+file:/path//foo.html file:/path/foo.html
Modified: nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.xml?rev=1636736&r1=1636735&r2=1636736&view=diff
==============================================================================
--- nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.xml (original)
+++ nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.xml Tue Nov 4 21:07:41 2014
@@ -48,5 +48,19 @@
<substitution></substitution>
</regex>
+<!-- normalize file:/// protocol prefix: -->
+<!-- keep one single slash (NUTCH-1483) -->
+<regex>
+ <pattern>^file://+</pattern>
+ <substitution>file:/</substitution>
+</regex>
+
+<!-- removes duplicate slashes but -->
+<!-- * allow 2 slashes after colon ':' (indicating protocol) -->
+<regex>
+ <pattern>(?<!:)/{2,}</pattern>
+ <substitution>/</substitution>
+</regex>
+
</regex-normalize>
Modified: nutch/trunk/src/test/org/apache/nutch/util/TestURLUtil.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/util/TestURLUtil.java?rev=1636736&r1=1636735&r2=1636736&view=diff
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/util/TestURLUtil.java (original)
+++ nutch/trunk/src/test/org/apache/nutch/util/TestURLUtil.java Tue Nov 4 21:07:41 2014
@@ -275,4 +275,11 @@ public class TestURLUtil {
URLUtil.toASCII("http://www.medizin.uni-tübingen.de:8080/search.php?q=abc#p1"));
}
+ @Test
+ public void testFileProtocol() throws Exception {
+ // keep one single slash (NUTCH-1880)
+ Assert.assertEquals("file:/path/file.html", URLUtil.toASCII("file:/path/file.html"));
+ Assert.assertEquals("file:/path/file.html", URLUtil.toUNICODE("file:/path/file.html"));
+ }
+
}