You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2016/04/18 11:43:39 UTC
nutch git commit: fix for NUTCH-2191 - fixing Nutch build -
contributed by karanjeets
Repository: nutch
Updated Branches:
refs/heads/master 044e8e77e -> 8572fd955
fix for NUTCH-2191 - fixing Nutch build - contributed by karanjeets
Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/8572fd95
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/8572fd95
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/8572fd95
Branch: refs/heads/master
Commit: 8572fd9551b430f31a4fdace14738f2d9959b370
Parents: 044e8e7
Author: Karanjeet Singh <co...@gmail.com>
Authored: Mon Apr 18 00:45:37 2016 -0700
Committer: Karanjeet Singh <co...@gmail.com>
Committed: Mon Apr 18 00:45:37 2016 -0700
----------------------------------------------------------------------
src/plugin/protocol-htmlunit/build.xml | 9 -
.../nutch/protocol/htmlunit/HttpResponse.java | 408 ++++++++++++++-----
2 files changed, 317 insertions(+), 100 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/nutch/blob/8572fd95/src/plugin/protocol-htmlunit/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-htmlunit/build.xml b/src/plugin/protocol-htmlunit/build.xml
index bf695fe..899214c 100644
--- a/src/plugin/protocol-htmlunit/build.xml
+++ b/src/plugin/protocol-htmlunit/build.xml
@@ -34,13 +34,4 @@
<pathelement location="${build.dir}/test/conf"/>
</path>
- <!-- Deploy Unit test dependencies -->
- <target name="deps-test">
- <ant target="deploy" inheritall="false" dir="../lib-http"/>
- <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
- <copy toDir="${build.test}">
- <fileset dir="${src.test}" excludes="**/*.java"/>
- </copy>
- </target>
-
</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/8572fd95/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
index 7242f40..8b1a031 100644
--- a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
+++ b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
@@ -20,11 +20,18 @@ import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.EOFException;
import java.io.IOException;
+import java.io.InputStream;
import java.io.OutputStream;
import java.io.PushbackInputStream;
import java.net.InetSocketAddress;
import java.net.Socket;
import java.net.URL;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+import javax.net.ssl.SSLSocket;
+import javax.net.ssl.SSLSocketFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.crawl.CrawlDatum;
@@ -35,46 +42,78 @@ import org.apache.nutch.net.protocols.Response;
import org.apache.nutch.protocol.ProtocolException;
import org.apache.nutch.protocol.http.api.HttpBase;
import org.apache.nutch.protocol.http.api.HttpException;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
+/**
+ * An HTTP response.
+ */
public class HttpResponse implements Response {
- private static final Logger LOG = LoggerFactory.getLogger(HttpResponse.class);
-
- private Http http;
+ private Configuration conf;
+ private HttpBase http;
private URL url;
+ private String orig;
+ private String base;
private byte[] content;
private int code;
private Metadata headers = new SpellCheckedMetadata();
+ // used for storing the http headers verbatim
+ private StringBuffer httpHeaders;
- /** The nutch configuration */
- private Configuration conf = null;
+ protected enum Scheme {
+ HTTP, HTTPS,
+ }
- public HttpResponse(Http http, URL url, CrawlDatum datum) throws ProtocolException, IOException {
+ /**
+ * Default public constructor.
+ *
+ * @param http
+ * @param url
+ * @param datum
+ * @throws ProtocolException
+ * @throws IOException
+ */
+ public HttpResponse(HttpBase http, URL url, CrawlDatum datum)
+ throws ProtocolException, IOException {
- this.conf = http.getConf();
this.http = http;
this.url = url;
+ this.orig = url.toString();
+ this.base = url.toString();
+
+ Scheme scheme = null;
+
+ if ("http".equals(url.getProtocol())) {
+ scheme = Scheme.HTTP;
+ } else if ("https".equals(url.getProtocol())) {
+ scheme = Scheme.HTTPS;
+ } else {
+ throw new HttpException("Unknown scheme (not http/https) for url:" + url);
+ }
+
+ if (Http.LOG.isTraceEnabled()) {
+ Http.LOG.trace("fetching " + url);
+ }
- LOG.info("fetching {}", url);
-
String path = "".equals(url.getFile()) ? "/" : url.getFile();
// some servers will redirect a request with a host line like
// "Host: <hostname>:80" to "http://<hpstname>/<orig_path>"- they
// don't want the :80...
+
String host = url.getHost();
int port;
String portString;
if (url.getPort() == -1) {
- port = 80;
+ if (scheme == Scheme.HTTP) {
+ port = 80;
+ } else {
+ port = 443;
+ }
portString = "";
} else {
port = url.getPort();
portString = ":" + port;
}
-
Socket socket = null;
try {
@@ -87,6 +126,38 @@ public class HttpResponse implements Response {
InetSocketAddress sockAddr = new InetSocketAddress(sockHost, sockPort);
socket.connect(sockAddr, http.getTimeout());
+ if (scheme == Scheme.HTTPS) {
+ SSLSocketFactory factory = (SSLSocketFactory) SSLSocketFactory
+ .getDefault();
+ SSLSocket sslsocket = (SSLSocket) factory
+ .createSocket(socket, sockHost, sockPort, true);
+ sslsocket.setUseClientMode(true);
+
+ // Get the protocols and ciphers supported by this JVM
+ Set<String> protocols = new HashSet<String>(
+ Arrays.asList(sslsocket.getSupportedProtocols()));
+ Set<String> ciphers = new HashSet<String>(
+ Arrays.asList(sslsocket.getSupportedCipherSuites()));
+
+ // Intersect with preferred protocols and ciphers
+ protocols.retainAll(http.getTlsPreferredProtocols());
+ ciphers.retainAll(http.getTlsPreferredCipherSuites());
+
+ sslsocket.setEnabledProtocols(
+ protocols.toArray(new String[protocols.size()]));
+ sslsocket.setEnabledCipherSuites(
+ ciphers.toArray(new String[ciphers.size()]));
+
+ sslsocket.startHandshake();
+ socket = sslsocket;
+ }
+
+ this.conf = http.getConf();
+ if (sockAddr != null
+ && conf.getBoolean("store.ip.address", false) == true) {
+ headers.add("_ip_", sockAddr.getAddress().getHostAddress());
+ }
+
// make request
OutputStream req = socket.getOutputStream();
@@ -97,7 +168,6 @@ public class HttpResponse implements Response {
reqStr.append(path);
}
- // TODO: Write code for Https
reqStr.append(" HTTP/1.0\r\n");
reqStr.append("Host: ");
@@ -126,91 +196,262 @@ public class HttpResponse implements Response {
reqStr.append(this.http.getAccept());
reqStr.append("\r\n");
- if (datum.getModifiedTime() > 0) {
- reqStr.append("If-Modified-Since: " + HttpDateFormat.toString(datum.getModifiedTime()));
+ if (http.isIfModifiedSinceEnabled() && datum.getModifiedTime() > 0) {
+ reqStr.append("If-Modified-Since: " + HttpDateFormat
+ .toString(datum.getModifiedTime()));
reqStr.append("\r\n");
}
reqStr.append("\r\n");
+ // store the request in the metadata?
+ if (conf.getBoolean("store.http.request", false) == true) {
+ headers.add("_request_", reqStr.toString());
+ }
+
byte[] reqBytes = reqStr.toString().getBytes();
req.write(reqBytes);
req.flush();
PushbackInputStream in = // process response
- new PushbackInputStream(new BufferedInputStream(socket.getInputStream(), Http.BUFFER_SIZE),
- Http.BUFFER_SIZE);
+ new PushbackInputStream(
+ new BufferedInputStream(socket.getInputStream(),
+ Http.BUFFER_SIZE), Http.BUFFER_SIZE);
StringBuffer line = new StringBuffer();
+ // store the http headers verbatim
+ if (conf.getBoolean("store.http.headers", false) == true) {
+ httpHeaders = new StringBuffer();
+ }
+
+ headers.add("nutch.fetch.time", Long.toString(System.currentTimeMillis()));
+
boolean haveSeenNonContinueStatus = false;
while (!haveSeenNonContinueStatus) {
// parse status code line
this.code = parseStatusLine(in, line);
+ if (httpHeaders != null)
+ httpHeaders.append(line).append("\n");
// parse headers
- parseHeaders(in, line);
+ parseHeaders(in, line, httpHeaders);
haveSeenNonContinueStatus = code != 100; // 100 is "Continue"
}
// Get Content type header
String contentType = getHeader(Response.CONTENT_TYPE);
- // handle with Selenium only if content type in HTML or XHTML
+ // handle with HtmlUnit only if content type in HTML or XHTML
if (contentType != null) {
if (contentType.contains("text/html") || contentType.contains("application/xhtml")) {
- readPlainContent(url);
+ readContentFromHtmlUnit(url);
} else {
- try {
- int contentLength = Integer.MAX_VALUE;
- String contentLengthString = headers.get(Response.CONTENT_LENGTH);
- if (contentLengthString != null) {
- try {
- contentLength = Integer.parseInt(contentLengthString.trim());
- } catch (NumberFormatException ex) {
- throw new HttpException("bad content length: " + contentLengthString);
- }
- }
+ String transferEncoding = getHeader(Response.TRANSFER_ENCODING);
+ if (transferEncoding != null && "chunked"
+ .equalsIgnoreCase(transferEncoding.trim())) {
+ readChunkedContent(in, line);
+ } else {
+ readPlainContent(in);
+ }
- if (http.getMaxContent() >= 0 && contentLength > http.getMaxContent()) {
- contentLength = http.getMaxContent();
+ String contentEncoding = getHeader(Response.CONTENT_ENCODING);
+ if ("gzip".equals(contentEncoding) || "x-gzip".equals(contentEncoding)) {
+ content = http.processGzipEncoded(content, url);
+ } else if ("deflate".equals(contentEncoding)) {
+ content = http.processDeflateEncoded(content, url);
+ } else {
+ // store the headers verbatim only if the response was not compressed
+ // as the content length reported will not match otherwise
+ if (httpHeaders != null) {
+ headers.add("_response.headers_", httpHeaders.toString());
}
-
- byte[] buffer = new byte[HttpBase.BUFFER_SIZE];
- int bufferFilled = 0;
- int totalRead = 0;
- ByteArrayOutputStream out = new ByteArrayOutputStream();
- while ((bufferFilled = in.read(buffer, 0, buffer.length)) != -1
- && totalRead + bufferFilled <= contentLength) {
- totalRead += bufferFilled;
- out.write(buffer, 0, bufferFilled);
- }
-
- content = out.toByteArray();
-
- } catch (Exception e) {
- if (code == 200)
- throw new IOException(e.toString());
- // for codes other than 200 OK, we are fine with empty content
- } finally {
- if (in != null) {
- in.close();
+ if (Http.LOG.isTraceEnabled()) {
+ Http.LOG.trace("fetched " + content.length + " bytes from " + url);
}
}
}
- }
+ }
} finally {
if (socket != null)
socket.close();
}
+
}
-
- private void readPlainContent(URL url) throws IOException {
+
+ /*
+ * ------------------------- * <implementation:Response> *
+ * -------------------------
+ */
+
+ public URL getUrl() {
+ return url;
+ }
+
+ public int getCode() {
+ return code;
+ }
+
+ public String getHeader(String name) {
+ return headers.get(name);
+ }
+
+ public Metadata getHeaders() {
+ return headers;
+ }
+
+ public byte[] getContent() {
+ return content;
+ }
+
+ /*
+ * ------------------------- * </implementation:Response> *
+ * -------------------------
+ */
+
+ private void readContentFromHtmlUnit(URL url) throws IOException {
String page = HtmlUnitWebDriver.getHtmlPage(url.toString(), conf);
content = page.getBytes("UTF-8");
}
+
+ private void readPlainContent(InputStream in)
+ throws HttpException, IOException {
+
+ int contentLength = Integer.MAX_VALUE; // get content length
+ String contentLengthString = headers.get(Response.CONTENT_LENGTH);
+ if (contentLengthString != null) {
+ contentLengthString = contentLengthString.trim();
+ try {
+ if (!contentLengthString.isEmpty())
+ contentLength = Integer.parseInt(contentLengthString);
+ } catch (NumberFormatException e) {
+ throw new HttpException("bad content length: " + contentLengthString);
+ }
+ }
+ if (http.getMaxContent() >= 0 && contentLength > http
+ .getMaxContent()) // limit
+ // download
+ // size
+ contentLength = http.getMaxContent();
+
+ ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE);
+ byte[] bytes = new byte[Http.BUFFER_SIZE];
+ int length = 0;
+
+ // do not try to read if the contentLength is 0
+ if (contentLength == 0) {
+ content = new byte[0];
+ return;
+ }
+
+ // read content
+ int i = in.read(bytes);
+ while (i != -1) {
+ out.write(bytes, 0, i);
+ length += i;
+ if (length >= contentLength) {
+ break;
+ }
+ if ((length + Http.BUFFER_SIZE) > contentLength) {
+ // reading next chunk may hit contentLength,
+ // must limit number of bytes read
+ i = in.read(bytes, 0, (contentLength - length));
+ } else {
+ i = in.read(bytes);
+ }
+ }
+ content = out.toByteArray();
+ }
+
+ /**
+ * @param in
+ * @param line
+ * @throws HttpException
+ * @throws IOException
+ */
+ private void readChunkedContent(PushbackInputStream in, StringBuffer line)
+ throws HttpException, IOException {
+ boolean doneChunks = false;
+ int contentBytesRead = 0;
+ byte[] bytes = new byte[Http.BUFFER_SIZE];
+ ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE);
+
+ while (!doneChunks) {
+ if (Http.LOG.isTraceEnabled()) {
+ Http.LOG.trace("Http: starting chunk");
+ }
+
+ readLine(in, line, false);
+
+ String chunkLenStr;
+ // if (LOG.isTraceEnabled()) { LOG.trace("chunk-header: '" + line + "'");
+ // }
+
+ int pos = line.indexOf(";");
+ if (pos < 0) {
+ chunkLenStr = line.toString();
+ } else {
+ chunkLenStr = line.substring(0, pos);
+ // if (LOG.isTraceEnabled()) { LOG.trace("got chunk-ext: " +
+ // line.substring(pos+1)); }
+ }
+ chunkLenStr = chunkLenStr.trim();
+ int chunkLen;
+ try {
+ chunkLen = Integer.parseInt(chunkLenStr, 16);
+ } catch (NumberFormatException e) {
+ throw new HttpException("bad chunk length: " + line.toString());
+ }
+
+ if (chunkLen == 0) {
+ doneChunks = true;
+ break;
+ }
+
+ if (http.getMaxContent() >= 0 && (contentBytesRead + chunkLen) > http
+ .getMaxContent())
+ chunkLen = http.getMaxContent() - contentBytesRead;
+
+ // read one chunk
+ int chunkBytesRead = 0;
+ while (chunkBytesRead < chunkLen) {
+
+ int toRead = (chunkLen - chunkBytesRead) < Http.BUFFER_SIZE ?
+ (chunkLen - chunkBytesRead) :
+ Http.BUFFER_SIZE;
+ int len = in.read(bytes, 0, toRead);
+
+ if (len == -1)
+ throw new HttpException("chunk eof after " + contentBytesRead
+ + " bytes in successful chunks" + " and " + chunkBytesRead
+ + " in current chunk");
+
+ // DANGER!!! Will print GZIPed stuff right to your
+ // terminal!
+ // if (LOG.isTraceEnabled()) { LOG.trace("read: " + new String(bytes, 0,
+ // len)); }
+
+ out.write(bytes, 0, len);
+ chunkBytesRead += len;
+ }
- private int parseStatusLine(PushbackInputStream in, StringBuffer line) throws IOException, HttpException {
+ readLine(in, line, false);
+
+ }
+
+ if (!doneChunks) {
+ if (contentBytesRead != http.getMaxContent())
+ throw new HttpException("chunk eof: !doneChunk && didn't max out");
+ return;
+ }
+
+ content = out.toByteArray();
+ parseHeaders(in, line, null);
+
+ }
+
+ private int parseStatusLine(PushbackInputStream in, StringBuffer line)
+ throws IOException, HttpException {
readLine(in, line, false);
int codeStart = line.indexOf(" ");
@@ -225,13 +466,15 @@ public class HttpResponse implements Response {
try {
code = Integer.parseInt(line.substring(codeStart + 1, codeEnd));
} catch (NumberFormatException e) {
- throw new HttpException("bad status line '" + line + "': " + e.getMessage(), e);
+ throw new HttpException(
+ "bad status line '" + line + "': " + e.getMessage(), e);
}
return code;
}
- private void processHeaderLine(StringBuffer line) throws IOException, HttpException {
+ private void processHeaderLine(StringBuffer line)
+ throws IOException, HttpException {
int colonIndex = line.indexOf(":"); // key is up to colon
if (colonIndex == -1) {
@@ -257,24 +500,29 @@ public class HttpResponse implements Response {
}
// Adds headers to our headers Metadata
- private void parseHeaders(PushbackInputStream in, StringBuffer line) throws IOException, HttpException {
+ private void parseHeaders(PushbackInputStream in, StringBuffer line,
+ StringBuffer httpHeaders) throws IOException, HttpException {
while (readLine(in, line, true) != 0) {
+ if (httpHeaders != null)
+ httpHeaders.append(line).append("\n");
+
// handle HTTP responses with missing blank line after headers
int pos;
- if (((pos = line.indexOf("<!DOCTYPE")) != -1) || ((pos = line.indexOf("<HTML")) != -1)
- || ((pos = line.indexOf("<html")) != -1)) {
+ if (((pos = line.indexOf("<!DOCTYPE")) != -1) || (
+ (pos = line.indexOf("<HTML")) != -1) || ((pos = line.indexOf("<html"))
+ != -1)) {
in.unread(line.substring(pos).getBytes("UTF-8"));
line.setLength(pos);
try {
- //TODO: (CM) We don't know the header names here
- //since we're just handling them generically. It would
- //be nice to provide some sort of mapping function here
- //for the returned header names to the standard metadata
- //names in the ParseData class
+ // TODO: (CM) We don't know the header names here
+ // since we're just handling them generically. It would
+ // be nice to provide some sort of mapping function here
+ // for the returned header names to the standard metadata
+ // names in the ParseData class
processHeaderLine(line);
} catch (Exception e) {
// fixme:
@@ -287,8 +535,8 @@ public class HttpResponse implements Response {
}
}
- private static int readLine(PushbackInputStream in, StringBuffer line, boolean allowContinuedLine)
- throws IOException {
+ private static int readLine(PushbackInputStream in, StringBuffer line,
+ boolean allowContinuedLine) throws IOException {
line.setLength(0);
for (int c = in.read(); c != -1; c = in.read()) {
switch (c) {
@@ -322,26 +570,4 @@ public class HttpResponse implements Response {
return value;
}
- public URL getUrl() {
- return url;
- }
-
- public String getHeader(String name) {
- return headers.get(name);
- }
-
- public Metadata getHeaders() {
- return headers;
- }
-
- public byte[] getContent() {
- return content;
- }
-
- @Override
- public int getCode() {
- // TODO Auto-generated method stub
- return code;
- }
}
-