You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by jn...@apache.org on 2014/06/11 17:49:45 UTC

svn commit: r1601935 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/metadata/HttpHeaders.java src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java

Author: jnioche
Date: Wed Jun 11 15:49:45 2014
New Revision: 1601935

URL: http://svn.apache.org/r1601935
Log:
NUTCH-1736 Can't fetch page if HTTP response header contains "Transfer-Encoding: chunked"

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/java/org/apache/nutch/metadata/HttpHeaders.java
    nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1601935&r1=1601934&r2=1601935&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Jun 11 15:49:45 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Nutch Current Development
 
+* NUTCH-1736 Can't fetch page if http response header contains Transfer-Encoding:chunked (ysc via jnioche)
+
 * NUTCH-1782 NodeWalker to return current node (markus)
 
 * NUTCH-1758 IndexChecker to send document to IndexWriters (jnioche)

Modified: nutch/trunk/src/java/org/apache/nutch/metadata/HttpHeaders.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/metadata/HttpHeaders.java?rev=1601935&r1=1601934&r2=1601935&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/metadata/HttpHeaders.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/metadata/HttpHeaders.java Wed Jun 11 15:49:45 2014
@@ -26,6 +26,8 @@ import org.apache.hadoop.io.Text;
  */
 public interface HttpHeaders {
 
+  public final static String TRANSFER_ENCODING = "Transfer-Encoding";
+	
   public final static String CONTENT_ENCODING = "Content-Encoding";
   
   public final static String CONTENT_LANGUAGE = "Content-Language";

Modified: nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java?rev=1601935&r1=1601934&r2=1601935&view=diff
==============================================================================
--- nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java (original)
+++ nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java Wed Jun 11 15:49:45 2014
@@ -213,8 +213,13 @@ public class HttpResponse implements Res
         parseHeaders(in, line);
         haveSeenNonContinueStatus= code != 100; // 100 is "Continue"
       }
-
-      readPlainContent(in);
+      String transferEncoding = getHeader(Response.TRANSFER_ENCODING);
+      if (transferEncoding != null
+          && "chunked".equalsIgnoreCase(transferEncoding.trim())) {
+        readChunkedContent(in, line);
+      } else {
+        readPlainContent(in);
+      }
 
       String contentEncoding = getHeader(Response.CONTENT_ENCODING);
       if ("gzip".equals(contentEncoding) || "x-gzip".equals(contentEncoding)) {
@@ -339,7 +344,7 @@ public class HttpResponse implements Res
         break;
       }
 
-      if ( (contentBytesRead + chunkLen) > http.getMaxContent() )
+      if ( http.getMaxContent() >= 0 && (contentBytesRead + chunkLen) > http.getMaxContent() )
         chunkLen= http.getMaxContent() - contentBytesRead;
 
       // read one chunk