You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by jn...@apache.org on 2011/01/07 18:18:07 UTC
svn commit: r1056401 - in /nutch/branches/branch-1.3: ./ src/plugin/
src/plugin/protocol-file/ src/plugin/protocol-file/sample/
src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/
src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/
Author: jnioche
Date: Fri Jan 7 17:18:06 2011
New Revision: 1056401
URL: http://svn.apache.org/viewvc?rev=1056401&view=rev
Log:
NUTCH-824 FileProtocol does not resolve encoded URLs
Added:
nutch/branches/branch-1.3/src/plugin/protocol-file/sample/
nutch/branches/branch-1.3/src/plugin/protocol-file/sample/testprotocolfile.txt
nutch/branches/branch-1.3/src/plugin/protocol-file/sample/testprotocolfile_(encoded).txt
Removed:
nutch/branches/branch-1.3/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/testprotocolfile.txt
Modified:
nutch/branches/branch-1.3/CHANGES.txt
nutch/branches/branch-1.3/src/plugin/build.xml
nutch/branches/branch-1.3/src/plugin/protocol-file/build.xml
nutch/branches/branch-1.3/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
nutch/branches/branch-1.3/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/TestProtocolFile.java
Modified: nutch/branches/branch-1.3/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.3/CHANGES.txt?rev=1056401&r1=1056400&r2=1056401&view=diff
==============================================================================
--- nutch/branches/branch-1.3/CHANGES.txt (original)
+++ nutch/branches/branch-1.3/CHANGES.txt Fri Jan 7 17:18:06 2011
@@ -2,6 +2,8 @@ Nutch Change Log
Release 1.3 - Current Development
+* NUTCH-824 Crawling - File Error 404 when fetching file with a hexadecimal character in the file name (Michela Becchi via jnioche)
+
* NUTCH-954 Strict application of Content-Length limit for http protocols (Alexis Detreglode via jnioche)
* NUTCH-950 DomainURLFilter throws NPE on bogus urls (Alexis Detreglode via jnioche)
Modified: nutch/branches/branch-1.3/src/plugin/build.xml
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.3/src/plugin/build.xml?rev=1056401&r1=1056400&r2=1056401&view=diff
==============================================================================
--- nutch/branches/branch-1.3/src/plugin/build.xml (original)
+++ nutch/branches/branch-1.3/src/plugin/build.xml Fri Jan 7 17:18:06 2011
@@ -74,6 +74,7 @@
<ant dir="index-more" target="test"/>
<ant dir="languageidentifier" target="test"/>
<ant dir="lib-http" target="test"/>
+ <ant dir="protocol-file" target="test"/>
<ant dir="protocol-httpclient" target="test"/>
<!--ant dir="parse-ext" target="test"/-->
<ant dir="parse-rss" target="test"/>
Modified: nutch/branches/branch-1.3/src/plugin/protocol-file/build.xml
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.3/src/plugin/protocol-file/build.xml?rev=1056401&r1=1056400&r2=1056401&view=diff
==============================================================================
--- nutch/branches/branch-1.3/src/plugin/protocol-file/build.xml (original)
+++ nutch/branches/branch-1.3/src/plugin/protocol-file/build.xml Fri Jan 7 17:18:06 2011
@@ -18,5 +18,12 @@
<project name="protocol-file" default="jar-core">
<import file="../build-plugin.xml"/>
-
+
+ <!-- for junit test -->
+ <mkdir dir="${build.test}/data"/>
+ <copy todir="${build.test}/data">
+ <fileset dir="sample">
+ <include name="*.txt"/>
+ </fileset>
+ </copy>
</project>
Added: nutch/branches/branch-1.3/src/plugin/protocol-file/sample/testprotocolfile.txt
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.3/src/plugin/protocol-file/sample/testprotocolfile.txt?rev=1056401&view=auto
==============================================================================
--- nutch/branches/branch-1.3/src/plugin/protocol-file/sample/testprotocolfile.txt (added)
+++ nutch/branches/branch-1.3/src/plugin/protocol-file/sample/testprotocolfile.txt Fri Jan 7 17:18:06 2011
@@ -0,0 +1 @@
+Protocol File Test
Added: nutch/branches/branch-1.3/src/plugin/protocol-file/sample/testprotocolfile_(encoded).txt
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.3/src/plugin/protocol-file/sample/testprotocolfile_%28encoded%29.txt?rev=1056401&view=auto
==============================================================================
--- nutch/branches/branch-1.3/src/plugin/protocol-file/sample/testprotocolfile_(encoded).txt (added)
+++ nutch/branches/branch-1.3/src/plugin/protocol-file/sample/testprotocolfile_(encoded).txt Fri Jan 7 17:18:06 2011
@@ -0,0 +1 @@
+Protocol File Test
Modified: nutch/branches/branch-1.3/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.3/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java?rev=1056401&r1=1056400&r2=1056401&view=diff
==============================================================================
--- nutch/branches/branch-1.3/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java (original)
+++ nutch/branches/branch-1.3/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java Fri Jan 7 17:18:06 2011
@@ -22,6 +22,7 @@ import java.net.URL;
import java.util.Date;
import java.util.TreeMap;
import java.io.IOException;
+import java.io.UnsupportedEncodingException;
// Nutch imports
import org.apache.nutch.crawl.CrawlDatum;
@@ -37,31 +38,27 @@ import org.apache.tika.mime.MimeType;
// Hadoop imports
import org.apache.hadoop.conf.Configuration;
-
/************************************
- * FileResponse.java mimics file replies as http response.
- * It tries its best to follow http's way for headers, response codes
- * as well as exceptions.
- *
- * Comments:
- * (1) java.net.URL and java.net.URLConnection can handle file: scheme.
- * However they are not flexible enough, so not used in this implementation.
- *
- * (2) java.io.File is used for its abstractness across platforms.
- * Warning:
- * java.io.File API (1.4.2) does not elaborate on how special files,
- * such as /dev/* in unix and /proc/* on linux, are treated. Tests show
- * (a) java.io.File.isFile() return false for /dev/*
- * (b) java.io.File.isFile() return true for /proc/*
- * (c) java.io.File.length() return 0 for /proc/*
- * We are probably oaky for now. Could be buggy here.
- * How about special files on windows?
- *
- * (3) java.io.File API (1.4.2) does not seem to know unix hard link files.
- * They are just treated as individual files.
- *
+ * FileResponse.java mimics file replies as http response. It tries its best to
+ * follow http's way for headers, response codes as well as exceptions.
+ *
+ * Comments: (1) java.net.URL and java.net.URLConnection can handle file:
+ * scheme. However they are not flexible enough, so not used in this
+ * implementation.
+ *
+ * (2) java.io.File is used for its abstractness across platforms. Warning:
+ * java.io.File API (1.4.2) does not elaborate on how special files, such as
+ * /dev/* in unix and /proc/* on linux, are treated. Tests show (a)
+ * java.io.File.isFile() return false for /dev/* (b) java.io.File.isFile()
+ * return true for /proc/* (c) java.io.File.length() return 0 for /proc/* We are
+ * probably okay for now. Could be buggy here. How about special files on
+ * windows?
+ *
+ * (3) java.io.File API (1.4.2) does not seem to know unix hard link files. They
+ * are just treated as individual files.
+ *
* (4) No funcy POSIX file attributes yet. May never need?
- *
+ *
* @author John Xing
***********************************/
public class FileResponse {
@@ -75,25 +72,28 @@ public class FileResponse {
private final File file;
private Configuration conf;
-
+
private MimeUtil MIME;
/** Returns the response code. */
- public int getCode() { return code; }
+ public int getCode() {
+ return code;
+ }
/** Returns the value of a named header. */
public String getHeader(String name) {
return headers.get(name);
}
- public byte[] getContent() { return content; }
+ public byte[] getContent() {
+ return content;
+ }
public Content toContent() {
return new Content(orig, base, (content != null ? content : EMPTY_CONTENT),
- getHeader(Response.CONTENT_TYPE),
- headers, this.conf);
+ getHeader(Response.CONTENT_TYPE), headers, this.conf);
}
-
+
public FileResponse(URL url, CrawlDatum datum, File file, Configuration conf)
throws FileException, IOException {
@@ -120,6 +120,12 @@ public class FileResponse {
String path = "".equals(url.getPath()) ? "/" : url.getPath();
try {
+ // specify the encoding via the config later?
+ path = java.net.URLDecoder.decode(path, "UTF-8");
+ } catch (UnsupportedEncodingException ex) {
+ }
+
+ try {
this.content = null;
@@ -170,45 +176,46 @@ public class FileResponse {
}
// get file as http response
- private void getFileAsHttpResponse(java.io.File f)
- throws FileException, IOException {
+ private void getFileAsHttpResponse(java.io.File f) throws FileException,
+ IOException {
// ignore file of size larger than
// Integer.MAX_VALUE = 2^31-1 = 2147483647
long size = f.length();
if (size > Integer.MAX_VALUE) {
- throw new FileException("file is too large, size: "+size);
+ throw new FileException("file is too large, size: " + size);
// or we can do this?
- // this.code = 400; // http Bad request
+ // this.code = 400; // http Bad request
// return;
}
// capture content
int len = (int) size;
-
+
if (this.file.maxContentLength >= 0 && len > this.file.maxContentLength)
len = this.file.maxContentLength;
this.content = new byte[len];
java.io.InputStream is = new java.io.FileInputStream(f);
- int offset = 0; int n = 0;
+ int offset = 0;
+ int n = 0;
while (offset < len
- && (n = is.read(this.content, offset, len-offset)) >= 0) {
+ && (n = is.read(this.content, offset, len - offset)) >= 0) {
offset += n;
}
if (offset < len) { // keep whatever already have, but issue a warning
if (File.LOG.isWarnEnabled()) {
- File.LOG.warn("not enough bytes read from file: "+f.getPath());
+ File.LOG.warn("not enough bytes read from file: " + f.getPath());
}
}
- is.close();
+ is.close();
// set headers
headers.set(Response.CONTENT_LENGTH, new Long(size).toString());
- headers.set(Response.LAST_MODIFIED, HttpDateFormat.toString(f
- .lastModified()));
-
+ headers.set(Response.LAST_MODIFIED,
+ HttpDateFormat.toString(f.lastModified()));
+
MimeType mimeType = MIME.getMimeType(f);
String mimeTypeString = mimeType != null ? mimeType.getName() : "";
headers.set(Response.CONTENT_TYPE, mimeTypeString);
@@ -218,33 +225,33 @@ public class FileResponse {
}
// get dir list as http response
- private void getDirAsHttpResponse(java.io.File f)
- throws IOException {
+ private void getDirAsHttpResponse(java.io.File f) throws IOException {
String path = f.toString();
if (this.file.crawlParents)
- this.content = list2html(f.listFiles(), path, "/".equals(path) ? false : true);
+ this.content = list2html(f.listFiles(), path, "/".equals(path) ? false
+ : true);
else
- this.content = list2html(f.listFiles(), path, false);
+ this.content = list2html(f.listFiles(), path, false);
// set headers
headers.set(Response.CONTENT_LENGTH,
- new Integer(this.content.length).toString());
+ new Integer(this.content.length).toString());
headers.set(Response.CONTENT_TYPE, "text/html");
headers.set(Response.LAST_MODIFIED,
- HttpDateFormat.toString(f.lastModified()));
+ HttpDateFormat.toString(f.lastModified()));
// response code
this.code = 200; // http OK
}
// generate html page from dir list
- private byte[] list2html(java.io.File[] list,
- String path, boolean includeDotDot) {
+ private byte[] list2html(java.io.File[] list, String path,
+ boolean includeDotDot) {
StringBuffer x = new StringBuffer("<html><head>");
- x.append("<title>Index of "+path+"</title></head>\n");
- x.append("<body><h1>Index of "+path+"</h1><pre>\n");
+ x.append("<title>Index of " + path + "</title></head>\n");
+ x.append("<body><h1>Index of " + path + "</h1><pre>\n");
if (includeDotDot) {
x.append("<a href='../'>../</a>\t-\t-\t-\n");
@@ -253,20 +260,20 @@ public class FileResponse {
// fix me: we might want to sort list here! but not now.
java.io.File f;
- for (int i=0; i<list.length; i++) {
+ for (int i = 0; i < list.length; i++) {
f = list[i];
String name = f.getName();
String time = HttpDateFormat.toString(f.lastModified());
if (f.isDirectory()) {
// java 1.4.2 api says dir itself and parent dir are not listed
// so the following is not needed.
- //if (name.equals(".") || name.equals(".."))
- // continue;
- x.append("<a href='"+name+"/"+"'>"+name+"/</a>\t");
- x.append(time+"\t-\n");
+ // if (name.equals(".") || name.equals(".."))
+ // continue;
+ x.append("<a href='" + name + "/" + "'>" + name + "/</a>\t");
+ x.append(time + "\t-\n");
} else if (f.isFile()) {
- x.append("<a href='"+name+ "'>"+name+"</a>\t");
- x.append(time+"\t"+f.length()+"\n");
+ x.append("<a href='" + name + "'>" + name + "</a>\t");
+ x.append(time + "\t" + f.length() + "\n");
} else {
// ignore any other
}
Modified: nutch/branches/branch-1.3/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/TestProtocolFile.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.3/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/TestProtocolFile.java?rev=1056401&r1=1056400&r2=1056401&view=diff
==============================================================================
--- nutch/branches/branch-1.3/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/TestProtocolFile.java (original)
+++ nutch/branches/branch-1.3/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/TestProtocolFile.java Fri Jan 7 17:18:06 2011
@@ -18,11 +18,15 @@
package org.apache.nutch.protocol.file;
// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
// Nutch imports
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.protocol.ProtocolFactory;
import org.apache.nutch.protocol.ProtocolOutput;
import org.apache.nutch.protocol.ProtocolStatus;
import org.apache.nutch.util.NutchConfiguration;
@@ -34,37 +38,47 @@ import junit.framework.TestCase;
* @author mattmann
* @version $Revision$
*
- * <p>
- * Unit tests for the {@link File}Protocol.
- * </p>.
+ * <p>
+ * Unit tests for the {@link File}Protocol.
+ * </p>
+ * .
*/
public class TestProtocolFile extends TestCase {
- private static final org.apache.nutch.protocol.file.File fileProtocol =
- new org.apache.nutch.protocol.file.File();
+ private String fileSeparator = System.getProperty("file.separator");
+ private String sampleDir = System.getProperty("test.data", ".");
- private static final String testTextFile = "testprotocolfile.txt";
+ private static final String[] testTextFiles = new String[] {
+ "testprotocolfile.txt", "testprotocolfile_(encoded).txt", "testprotocolfile_%28encoded%29.txt" };
private static final CrawlDatum datum = new CrawlDatum();
private static final String expectedMimeType = "text/plain";
- static {
- fileProtocol.setConf(NutchConfiguration.create());
+ private Configuration conf;
+
+ protected void setUp() {
+ conf = NutchConfiguration.create();
+ }
+
+ public void testSetContentType() throws ProtocolException {
+ for (String testTextFile : testTextFiles) {
+ setContentType(testTextFile);
+ }
}
/**
- * Tests the setting of the <code>Response.CONTENT_TYPE</code> metadata
- * field.
+ * Tests the setting of the <code>Response.CONTENT_TYPE</code> metadata field.
*
* @since NUTCH-384
*
*/
- public void testSetContentType() {
- Text fileUrl = new Text(this.getClass().getResource(testTextFile)
- .toString());
- assertNotNull(fileUrl);
- ProtocolOutput output = fileProtocol.getProtocolOutput(fileUrl, datum);
+ public void setContentType(String testTextFile) throws ProtocolException {
+ String urlString = "file:" + sampleDir + fileSeparator + testTextFile;
+ assertNotNull(urlString);
+ Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
+ ProtocolOutput output = protocol.getProtocolOutput(new Text(urlString),
+ datum);
assertNotNull(output);
assertEquals("Status code: [" + output.getStatus().getCode()
+ "], not equal to: [" + ProtocolStatus.SUCCESS + "]: args: ["
@@ -74,8 +88,8 @@ public class TestProtocolFile extends Te
assertNotNull(output.getContent().getContentType());
assertEquals(expectedMimeType, output.getContent().getContentType());
assertNotNull(output.getContent().getMetadata());
- assertEquals(expectedMimeType, output.getContent().getMetadata().get(
- Response.CONTENT_TYPE));
+ assertEquals(expectedMimeType,
+ output.getContent().getMetadata().get(Response.CONTENT_TYPE));
}