You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by jn...@apache.org on 2011/01/07 18:18:07 UTC

svn commit: r1056401 - in /nutch/branches/branch-1.3: ./ src/plugin/ src/plugin/protocol-file/ src/plugin/protocol-file/sample/ src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/ src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/

Author: jnioche
Date: Fri Jan  7 17:18:06 2011
New Revision: 1056401

URL: http://svn.apache.org/viewvc?rev=1056401&view=rev
Log:
NUTCH-824 FileProtocol does not resolve encoded URLs

Added:
    nutch/branches/branch-1.3/src/plugin/protocol-file/sample/
    nutch/branches/branch-1.3/src/plugin/protocol-file/sample/testprotocolfile.txt
    nutch/branches/branch-1.3/src/plugin/protocol-file/sample/testprotocolfile_(encoded).txt
Removed:
    nutch/branches/branch-1.3/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/testprotocolfile.txt
Modified:
    nutch/branches/branch-1.3/CHANGES.txt
    nutch/branches/branch-1.3/src/plugin/build.xml
    nutch/branches/branch-1.3/src/plugin/protocol-file/build.xml
    nutch/branches/branch-1.3/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
    nutch/branches/branch-1.3/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/TestProtocolFile.java

Modified: nutch/branches/branch-1.3/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.3/CHANGES.txt?rev=1056401&r1=1056400&r2=1056401&view=diff
==============================================================================
--- nutch/branches/branch-1.3/CHANGES.txt (original)
+++ nutch/branches/branch-1.3/CHANGES.txt Fri Jan  7 17:18:06 2011
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Release 1.3 - Current Development
 
+* NUTCH-824 Crawling - File Error 404 when fetching file with an hexadecimal character in the file name (Michela Becchi via jnioche)
+
 * NUTCH-954 Strict application of Content-Length limit for http protocols (Alexis Detreglode via jnioche)
 
 * NUTCH-950 DomainURLFilter throws NPE on bogus urls (Alexis Detreglode via jnioche)

Modified: nutch/branches/branch-1.3/src/plugin/build.xml
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.3/src/plugin/build.xml?rev=1056401&r1=1056400&r2=1056401&view=diff
==============================================================================
--- nutch/branches/branch-1.3/src/plugin/build.xml (original)
+++ nutch/branches/branch-1.3/src/plugin/build.xml Fri Jan  7 17:18:06 2011
@@ -74,6 +74,7 @@
      <ant dir="index-more" target="test"/>
      <ant dir="languageidentifier" target="test"/>
      <ant dir="lib-http" target="test"/>
+     <ant dir="protocol-file" target="test"/>
      <ant dir="protocol-httpclient" target="test"/>
      <!--ant dir="parse-ext" target="test"/-->
      <ant dir="parse-rss" target="test"/>

Modified: nutch/branches/branch-1.3/src/plugin/protocol-file/build.xml
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.3/src/plugin/protocol-file/build.xml?rev=1056401&r1=1056400&r2=1056401&view=diff
==============================================================================
--- nutch/branches/branch-1.3/src/plugin/protocol-file/build.xml (original)
+++ nutch/branches/branch-1.3/src/plugin/protocol-file/build.xml Fri Jan  7 17:18:06 2011
@@ -18,5 +18,12 @@
 <project name="protocol-file" default="jar-core">
 
   <import file="../build-plugin.xml"/>
-
+  
+ <!-- for junit test -->
+  <mkdir dir="${build.test}/data"/>
+  <copy todir="${build.test}/data">
+    <fileset dir="sample">
+      <include name="*.txt"/>
+    </fileset>
+  </copy>
 </project>

Added: nutch/branches/branch-1.3/src/plugin/protocol-file/sample/testprotocolfile.txt
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.3/src/plugin/protocol-file/sample/testprotocolfile.txt?rev=1056401&view=auto
==============================================================================
--- nutch/branches/branch-1.3/src/plugin/protocol-file/sample/testprotocolfile.txt (added)
+++ nutch/branches/branch-1.3/src/plugin/protocol-file/sample/testprotocolfile.txt Fri Jan  7 17:18:06 2011
@@ -0,0 +1 @@
+Protocol File Test

Added: nutch/branches/branch-1.3/src/plugin/protocol-file/sample/testprotocolfile_(encoded).txt
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.3/src/plugin/protocol-file/sample/testprotocolfile_%28encoded%29.txt?rev=1056401&view=auto
==============================================================================
--- nutch/branches/branch-1.3/src/plugin/protocol-file/sample/testprotocolfile_(encoded).txt (added)
+++ nutch/branches/branch-1.3/src/plugin/protocol-file/sample/testprotocolfile_(encoded).txt Fri Jan  7 17:18:06 2011
@@ -0,0 +1 @@
+Protocol File Test

Modified: nutch/branches/branch-1.3/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.3/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java?rev=1056401&r1=1056400&r2=1056401&view=diff
==============================================================================
--- nutch/branches/branch-1.3/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java (original)
+++ nutch/branches/branch-1.3/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java Fri Jan  7 17:18:06 2011
@@ -22,6 +22,7 @@ import java.net.URL;
 import java.util.Date;
 import java.util.TreeMap;
 import java.io.IOException;
+import java.io.UnsupportedEncodingException;
 
 // Nutch imports
 import org.apache.nutch.crawl.CrawlDatum;
@@ -37,31 +38,27 @@ import org.apache.tika.mime.MimeType;
 // Hadoop imports
 import org.apache.hadoop.conf.Configuration;
 
-
 /************************************
- * FileResponse.java mimics file replies as http response.
- * It tries its best to follow http's way for headers, response codes
- * as well as exceptions.
- *
- * Comments:
- * (1) java.net.URL and java.net.URLConnection can handle file: scheme.
- * However they are not flexible enough, so not used in this implementation.
- *
- * (2) java.io.File is used for its abstractness across platforms.
- * Warning:
- * java.io.File API (1.4.2) does not elaborate on how special files,
- * such as /dev/* in unix and /proc/* on linux, are treated. Tests show
- *  (a) java.io.File.isFile() return false for /dev/*
- *  (b) java.io.File.isFile() return true for /proc/*
- *  (c) java.io.File.length() return 0 for /proc/*
- * We are probably oaky for now. Could be buggy here.
- * How about special files on windows?
- *
- * (3) java.io.File API (1.4.2) does not seem to know unix hard link files.
- * They are just treated as individual files.
- *
+ * FileResponse.java mimics file replies as http response. It tries its best to
+ * follow http's way for headers, response codes as well as exceptions.
+ * 
+ * Comments: (1) java.net.URL and java.net.URLConnection can handle file:
+ * scheme. However they are not flexible enough, so not used in this
+ * implementation.
+ * 
+ * (2) java.io.File is used for its abstractness across platforms. Warning:
+ * java.io.File API (1.4.2) does not elaborate on how special files, such as
+ * /dev/* in unix and /proc/* on linux, are treated. Tests show (a)
+ * java.io.File.isFile() return false for /dev/* (b) java.io.File.isFile()
+ * return true for /proc/* (c) java.io.File.length() return 0 for /proc/* We are
+ * probably oaky for now. Could be buggy here. How about special files on
+ * windows?
+ * 
+ * (3) java.io.File API (1.4.2) does not seem to know unix hard link files. They
+ * are just treated as individual files.
+ * 
  * (4) No funcy POSIX file attributes yet. May never need?
- *
+ * 
  * @author John Xing
  ***********************************/
 public class FileResponse {
@@ -75,25 +72,28 @@ public class FileResponse {
 
   private final File file;
   private Configuration conf;
-  
+
   private MimeUtil MIME;
 
   /** Returns the response code. */
-  public int getCode() { return code; }
+  public int getCode() {
+    return code;
+  }
 
   /** Returns the value of a named header. */
   public String getHeader(String name) {
     return headers.get(name);
   }
 
-  public byte[] getContent() { return content; }
+  public byte[] getContent() {
+    return content;
+  }
 
   public Content toContent() {
     return new Content(orig, base, (content != null ? content : EMPTY_CONTENT),
-                       getHeader(Response.CONTENT_TYPE),
-                       headers, this.conf);
+        getHeader(Response.CONTENT_TYPE), headers, this.conf);
   }
-  
+
   public FileResponse(URL url, CrawlDatum datum, File file, Configuration conf)
     throws FileException, IOException {
 
@@ -120,6 +120,12 @@ public class FileResponse {
     String path = "".equals(url.getPath()) ? "/" : url.getPath();
 
     try {
+      // specify the encoding via the config later?
+      path = java.net.URLDecoder.decode(path, "UTF-8");
+    } catch (UnsupportedEncodingException ex) {
+    }
+
+    try {
 
       this.content = null;
 
@@ -170,45 +176,46 @@ public class FileResponse {
   }
 
   // get file as http response
-  private void getFileAsHttpResponse(java.io.File f)
-    throws FileException, IOException {
+  private void getFileAsHttpResponse(java.io.File f) throws FileException,
+      IOException {
 
     // ignore file of size larger than
     // Integer.MAX_VALUE = 2^31-1 = 2147483647
     long size = f.length();
     if (size > Integer.MAX_VALUE) {
-      throw new FileException("file is too large, size: "+size);
+      throw new FileException("file is too large, size: " + size);
       // or we can do this?
-      // this.code = 400;  // http Bad request
+      // this.code = 400; // http Bad request
       // return;
     }
 
     // capture content
     int len = (int) size;
-    
+
     if (this.file.maxContentLength >= 0 && len > this.file.maxContentLength)
       len = this.file.maxContentLength;
 
     this.content = new byte[len];
 
     java.io.InputStream is = new java.io.FileInputStream(f);
-    int offset = 0; int n = 0;
+    int offset = 0;
+    int n = 0;
     while (offset < len
-      && (n = is.read(this.content, offset, len-offset)) >= 0) {
+        && (n = is.read(this.content, offset, len - offset)) >= 0) {
       offset += n;
     }
     if (offset < len) { // keep whatever already have, but issue a warning
       if (File.LOG.isWarnEnabled()) {
-        File.LOG.warn("not enough bytes read from file: "+f.getPath());
+        File.LOG.warn("not enough bytes read from file: " + f.getPath());
       }
     }
-    is.close(); 
+    is.close();
 
     // set headers
     headers.set(Response.CONTENT_LENGTH, new Long(size).toString());
-    headers.set(Response.LAST_MODIFIED, HttpDateFormat.toString(f
-        .lastModified()));
-    
+    headers.set(Response.LAST_MODIFIED,
+        HttpDateFormat.toString(f.lastModified()));
+
     MimeType mimeType = MIME.getMimeType(f);
     String mimeTypeString = mimeType != null ? mimeType.getName() : "";
     headers.set(Response.CONTENT_TYPE, mimeTypeString);
@@ -218,33 +225,33 @@ public class FileResponse {
   }
 
   // get dir list as http response
-  private void getDirAsHttpResponse(java.io.File f)
-    throws IOException {
+  private void getDirAsHttpResponse(java.io.File f) throws IOException {
 
     String path = f.toString();
     if (this.file.crawlParents)
-        this.content = list2html(f.listFiles(), path, "/".equals(path) ? false : true);
+      this.content = list2html(f.listFiles(), path, "/".equals(path) ? false
+          : true);
     else
-        this.content = list2html(f.listFiles(), path, false);
+      this.content = list2html(f.listFiles(), path, false);
 
     // set headers
     headers.set(Response.CONTENT_LENGTH,
-      new Integer(this.content.length).toString());
+        new Integer(this.content.length).toString());
     headers.set(Response.CONTENT_TYPE, "text/html");
     headers.set(Response.LAST_MODIFIED,
-      HttpDateFormat.toString(f.lastModified()));
+        HttpDateFormat.toString(f.lastModified()));
 
     // response code
     this.code = 200; // http OK
   }
 
   // generate html page from dir list
-  private byte[] list2html(java.io.File[] list,
-    String path, boolean includeDotDot) {
+  private byte[] list2html(java.io.File[] list, String path,
+      boolean includeDotDot) {
 
     StringBuffer x = new StringBuffer("<html><head>");
-    x.append("<title>Index of "+path+"</title></head>\n");
-    x.append("<body><h1>Index of "+path+"</h1><pre>\n");
+    x.append("<title>Index of " + path + "</title></head>\n");
+    x.append("<body><h1>Index of " + path + "</h1><pre>\n");
 
     if (includeDotDot) {
       x.append("<a href='../'>../</a>\t-\t-\t-\n");
@@ -253,20 +260,20 @@ public class FileResponse {
     // fix me: we might want to sort list here! but not now.
 
     java.io.File f;
-    for (int i=0; i<list.length; i++) {
+    for (int i = 0; i < list.length; i++) {
       f = list[i];
       String name = f.getName();
       String time = HttpDateFormat.toString(f.lastModified());
       if (f.isDirectory()) {
         // java 1.4.2 api says dir itself and parent dir are not listed
         // so the following is not needed.
-        //if (name.equals(".") || name.equals(".."))
-        //  continue;
-        x.append("<a href='"+name+"/"+"'>"+name+"/</a>\t");
-        x.append(time+"\t-\n");
+        // if (name.equals(".") || name.equals(".."))
+        // continue;
+        x.append("<a href='" + name + "/" + "'>" + name + "/</a>\t");
+        x.append(time + "\t-\n");
       } else if (f.isFile()) {
-        x.append("<a href='"+name+    "'>"+name+"</a>\t");
-        x.append(time+"\t"+f.length()+"\n");
+        x.append("<a href='" + name + "'>" + name + "</a>\t");
+        x.append(time + "\t" + f.length() + "\n");
       } else {
         // ignore any other
       }

Modified: nutch/branches/branch-1.3/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/TestProtocolFile.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.3/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/TestProtocolFile.java?rev=1056401&r1=1056400&r2=1056401&view=diff
==============================================================================
--- nutch/branches/branch-1.3/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/TestProtocolFile.java (original)
+++ nutch/branches/branch-1.3/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/TestProtocolFile.java Fri Jan  7 17:18:06 2011
@@ -18,11 +18,15 @@
 package org.apache.nutch.protocol.file;
 
 // Hadoop imports
+import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.io.Text;
 
 // Nutch imports
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.protocol.ProtocolFactory;
 import org.apache.nutch.protocol.ProtocolOutput;
 import org.apache.nutch.protocol.ProtocolStatus;
 import org.apache.nutch.util.NutchConfiguration;
@@ -34,37 +38,47 @@ import junit.framework.TestCase;
  * @author mattmann
  * @version $Revision$
  * 
- * <p>
- * Unit tests for the {@link File}Protocol.
- * </p>.
+ *          <p>
+ *          Unit tests for the {@link File}Protocol.
+ *          </p>
+ *          .
  */
 public class TestProtocolFile extends TestCase {
 
-  private static final org.apache.nutch.protocol.file.File fileProtocol = 
-    new org.apache.nutch.protocol.file.File();
+  private String fileSeparator = System.getProperty("file.separator");
+  private String sampleDir = System.getProperty("test.data", ".");
 
-  private static final String testTextFile = "testprotocolfile.txt";
+  private static final String[] testTextFiles = new String[] {
+      "testprotocolfile.txt", "testprotocolfile_(encoded).txt", "testprotocolfile_%28encoded%29.txt" };
 
   private static final CrawlDatum datum = new CrawlDatum();
 
   private static final String expectedMimeType = "text/plain";
 
-  static {
-    fileProtocol.setConf(NutchConfiguration.create());
+  private Configuration conf;
+
+  protected void setUp() {
+    conf = NutchConfiguration.create();
+  }
+
+  public void testSetContentType() throws ProtocolException {
+    for (String testTextFile : testTextFiles) {
+      setContentType(testTextFile);
+    }
   }
 
   /**
-   * Tests the setting of the <code>Response.CONTENT_TYPE</code> metadata
-   * field.
+   * Tests the setting of the <code>Response.CONTENT_TYPE</code> metadata field.
    * 
    * @since NUTCH-384
    * 
    */
-  public void testSetContentType() {
-    Text fileUrl = new Text(this.getClass().getResource(testTextFile)
-        .toString());
-    assertNotNull(fileUrl);
-    ProtocolOutput output = fileProtocol.getProtocolOutput(fileUrl, datum);
+  public void setContentType(String testTextFile) throws ProtocolException {
+    String urlString = "file:" + sampleDir + fileSeparator + testTextFile;
+    assertNotNull(urlString);
+    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
+    ProtocolOutput output = protocol.getProtocolOutput(new Text(urlString),
+        datum);
     assertNotNull(output);
     assertEquals("Status code: [" + output.getStatus().getCode()
         + "], not equal to: [" + ProtocolStatus.SUCCESS + "]: args: ["
@@ -74,8 +88,8 @@ public class TestProtocolFile extends Te
     assertNotNull(output.getContent().getContentType());
     assertEquals(expectedMimeType, output.getContent().getContentType());
     assertNotNull(output.getContent().getMetadata());
-    assertEquals(expectedMimeType, output.getContent().getMetadata().get(
-        Response.CONTENT_TYPE));
+    assertEquals(expectedMimeType,
+        output.getContent().getMetadata().get(Response.CONTENT_TYPE));
 
   }