You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by th...@apache.org on 2016/07/05 22:48:49 UTC
[05/69] [abbrv] [partial] nutch git commit: Re arranged the source
code as per maven conventions for build
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
deleted file mode 100644
index f7c7c6d..0000000
--- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
+++ /dev/null
@@ -1,521 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.protocol.ftp;
-
-import org.apache.commons.net.ftp.FTP;
-import org.apache.commons.net.ftp.FTPFile;
-import org.apache.commons.net.ftp.FTPReply;
-import org.apache.commons.net.ftp.parser.DefaultFTPFileEntryParserFactory;
-import org.apache.commons.net.ftp.parser.ParserInitializationException;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.net.protocols.HttpDateFormat;
-import org.apache.nutch.net.protocols.Response;
-import org.apache.hadoop.conf.Configuration;
-
-import java.net.InetAddress;
-import java.net.URL;
-import java.util.List;
-import java.util.LinkedList;
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-
-/**
- * FtpResponse.java mimics ftp replies as http response. It tries its best to
- * follow http's way for headers, response codes as well as exceptions.
- *
- * Comments: In this class, all FtpException*.java thrown by Client.java and
- * some important commons-net exceptions passed by Client.java must have been
- * properly dealt with. They'd better not be leaked to the caller of this class.
- */
-public class FtpResponse {
-
- private String orig;
- private String base;
- private byte[] content;
- private static final byte[] EMPTY_CONTENT = new byte[0];
- private int code;
- private Metadata headers = new Metadata();
-
- private final Ftp ftp;
- private Configuration conf;
-
- /** Returns the response code. */
- public int getCode() {
- return code;
- }
-
- /** Returns the value of a named header. */
- public String getHeader(String name) {
- return headers.get(name);
- }
-
- public byte[] getContent() {
- return content;
- }
-
- public Content toContent() {
- return new Content(orig, base, (content != null ? content : EMPTY_CONTENT),
- getHeader(Response.CONTENT_TYPE), headers, this.conf);
- }
-
- public FtpResponse(URL url, CrawlDatum datum, Ftp ftp, Configuration conf)
- throws FtpException, IOException {
-
- this.orig = url.toString();
- this.base = url.toString();
- this.ftp = ftp;
- this.conf = conf;
-
- if (!"ftp".equals(url.getProtocol()))
- throw new FtpException("Not a ftp url:" + url);
-
- if (url.getPath() != url.getFile()) {
- if (Ftp.LOG.isWarnEnabled()) {
- Ftp.LOG.warn("url.getPath() != url.getFile(): " + url);
- }
- }
-
- String path = "".equals(url.getPath()) ? "/" : url.getPath();
-
- try {
-
- if (ftp.followTalk) {
- if (Ftp.LOG.isInfoEnabled()) {
- Ftp.LOG.info("fetching " + url);
- }
- } else {
- if (Ftp.LOG.isTraceEnabled()) {
- Ftp.LOG.trace("fetching " + url);
- }
- }
-
- InetAddress addr = InetAddress.getByName(url.getHost());
- if (addr != null && conf.getBoolean("store.ip.address", false) == true) {
- headers.add("_ip_", addr.getHostAddress());
- }
-
- // idled too long, remote server or ourselves may have timed out,
- // should start anew.
- if (ftp.client != null && ftp.keepConnection
- && ftp.renewalTime < System.currentTimeMillis()) {
- if (Ftp.LOG.isInfoEnabled()) {
- Ftp.LOG.info("delete client because idled too long");
- }
- ftp.client = null;
- }
-
- // start anew if needed
- if (ftp.client == null) {
- if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
- Ftp.LOG.info("start client");
- }
- // the real client
- ftp.client = new Client();
- // when to renew, take the lesser
- // ftp.renewalTime = System.currentTimeMillis()
- // + ((ftp.timeout<ftp.serverTimeout) ? ftp.timeout :
- // ftp.serverTimeout);
-
- // timeout for control connection
- ftp.client.setDefaultTimeout(ftp.timeout);
- // timeout for data connection
- ftp.client.setDataTimeout(ftp.timeout);
-
- // follow ftp talk?
- if (ftp.followTalk)
- ftp.client.addProtocolCommandListener(new PrintCommandListener(
- Ftp.LOG));
- }
-
- // quit from previous site if at a different site now
- if (ftp.client.isConnected()) {
- InetAddress remoteAddress = ftp.client.getRemoteAddress();
- if (!addr.equals(remoteAddress)) {
- if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
- Ftp.LOG.info("disconnect from " + remoteAddress
- + " before connect to " + addr);
- }
- // quit from current site
- ftp.client.logout();
- ftp.client.disconnect();
- }
- }
-
- // connect to current site if needed
- if (!ftp.client.isConnected()) {
-
- if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
- Ftp.LOG.info("connect to " + addr);
- }
-
- ftp.client.connect(addr);
- if (!FTPReply.isPositiveCompletion(ftp.client.getReplyCode())) {
- ftp.client.disconnect();
- if (Ftp.LOG.isWarnEnabled()) {
- Ftp.LOG.warn("ftp.client.connect() failed: " + addr + " "
- + ftp.client.getReplyString());
- }
- this.code = 500; // http Internal Server Error
- return;
- }
-
- if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
- Ftp.LOG.info("log into " + addr);
- }
-
- if (!ftp.client.login(ftp.userName, ftp.passWord)) {
- // login failed.
- // please note that some server may return 421 immediately
- // after USER anonymous, thus ftp.client.login() won't return false,
- // but throw exception, which then will be handled by caller
- // (not dealt with here at all) .
- ftp.client.disconnect();
- if (Ftp.LOG.isWarnEnabled()) {
- Ftp.LOG.warn("ftp.client.login() failed: " + addr);
- }
- this.code = 401; // http Unauthorized
- return;
- }
-
- // insist on binary file type
- if (!ftp.client.setFileType(FTP.BINARY_FILE_TYPE)) {
- ftp.client.logout();
- ftp.client.disconnect();
- if (Ftp.LOG.isWarnEnabled()) {
- Ftp.LOG.warn("ftp.client.setFileType() failed: " + addr);
- }
- this.code = 500; // http Internal Server Error
- return;
- }
-
- if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
- Ftp.LOG.info("set parser for " + addr);
- }
-
- // SYST is valid only after login
- try {
- ftp.parser = null;
- String parserKey = ftp.client.getSystemName();
- // some server reports as UNKNOWN Type: L8, but in fact UNIX Type: L8
- if (parserKey.startsWith("UNKNOWN Type: L8"))
- parserKey = "UNIX Type: L8";
- ftp.parser = (new DefaultFTPFileEntryParserFactory())
- .createFileEntryParser(parserKey);
- } catch (FtpExceptionBadSystResponse e) {
- if (Ftp.LOG.isWarnEnabled()) {
- Ftp.LOG
- .warn("ftp.client.getSystemName() failed: " + addr + " " + e);
- }
- ftp.parser = null;
- } catch (ParserInitializationException e) {
- // ParserInitializationException is RuntimeException defined in
- // org.apache.commons.net.ftp.parser.ParserInitializationException
- if (Ftp.LOG.isWarnEnabled()) {
- Ftp.LOG.warn("createFileEntryParser() failed. " + addr + " " + e);
- }
- ftp.parser = null;
- } finally {
- if (ftp.parser == null) {
- // do not log as severe, otherwise
- // FetcherThread/RequestScheduler will abort
- if (Ftp.LOG.isWarnEnabled()) {
- Ftp.LOG.warn("ftp.parser is null: " + addr);
- }
- ftp.client.logout();
- ftp.client.disconnect();
- this.code = 500; // http Internal Server Error
- return;
- }
- }
-
- } else {
- if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
- Ftp.LOG.info("use existing connection");
- }
- }
-
- this.content = null;
-
- if (path.endsWith("/")) {
- getDirAsHttpResponse(path, datum.getModifiedTime());
- } else {
- getFileAsHttpResponse(path, datum.getModifiedTime());
- }
-
- // reset next renewalTime, take the lesser
- if (ftp.client != null && ftp.keepConnection) {
- ftp.renewalTime = System.currentTimeMillis()
- + ((ftp.timeout < ftp.serverTimeout) ? ftp.timeout
- : ftp.serverTimeout);
- if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
- Ftp.LOG.info("reset renewalTime to "
- + HttpDateFormat.toString(ftp.renewalTime));
- }
- }
-
- // getDirAsHttpResponse() or getFileAsHttpResponse() above
- // may have deleted ftp.client
- if (ftp.client != null && !ftp.keepConnection) {
- if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
- Ftp.LOG.info("disconnect from " + addr);
- }
- ftp.client.logout();
- ftp.client.disconnect();
- }
-
- } catch (Exception e) {
- if (Ftp.LOG.isWarnEnabled()) {
- Ftp.LOG.warn("Error: ", e);
- }
- // for any un-foreseen exception (run time exception or not),
- // do ultimate clean and leave ftp.client for garbage collection
- if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
- Ftp.LOG.info("delete client due to exception");
- }
- ftp.client = null;
- // or do explicit garbage collection?
- // System.gc();
- // can we be less dramatic, using the following instead?
- // probably unnecessary for our practical purpose here
- // try {
- // ftp.client.logout();
- // ftp.client.disconnect();
- // }
- throw new FtpException(e);
- // throw e;
- }
-
- }
-
- // get ftp file as http response
- private void getFileAsHttpResponse(String path, long lastModified)
- throws IOException {
-
- ByteArrayOutputStream os = null;
- List<FTPFile> list = null;
-
- try {
- // first get its possible attributes
- list = new LinkedList<FTPFile>();
- ftp.client.retrieveList(path, list, ftp.maxContentLength, ftp.parser);
-
- FTPFile ftpFile = (FTPFile) list.get(0);
- this.headers.set(Response.CONTENT_LENGTH,
- new Long(ftpFile.getSize()).toString());
- this.headers.set(Response.LAST_MODIFIED,
- HttpDateFormat.toString(ftpFile.getTimestamp()));
- // don't retrieve the file if not changed.
- if (ftpFile.getTimestamp().getTimeInMillis() <= lastModified) {
- code = 304;
- return;
- }
- os = new ByteArrayOutputStream(ftp.getBufferSize());
- ftp.client.retrieveFile(path, os, ftp.maxContentLength);
-
- this.content = os.toByteArray();
-
- // // approximate bytes sent and read
- // if (this.httpAccounting != null) {
- // this.httpAccounting.incrementBytesSent(path.length());
- // this.httpAccounting.incrementBytesRead(this.content.length);
- // }
-
- this.code = 200; // http OK
-
- } catch (FtpExceptionControlClosedByForcedDataClose e) {
-
- // control connection is off, clean up
- // ftp.client.disconnect();
- if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
- Ftp.LOG.info("delete client because server cut off control channel: "
- + e);
- }
- ftp.client = null;
-
- // in case this FtpExceptionControlClosedByForcedDataClose is
- // thrown by retrieveList() (not retrieveFile()) above,
- if (os == null) { // indicating throwing by retrieveList()
- // throw new FtpException("fail to get attibutes: "+path);
- if (Ftp.LOG.isWarnEnabled()) {
- Ftp.LOG
- .warn("Please try larger maxContentLength for ftp.client.retrieveList(). "
- + e);
- }
- // in a way, this is our request fault
- this.code = 400; // http Bad request
- return;
- }
-
- FTPFile ftpFile = (FTPFile) list.get(0);
- this.headers.set(Response.CONTENT_LENGTH,
- new Long(ftpFile.getSize()).toString());
- // this.headers.put("content-type", "text/html");
- this.headers.set(Response.LAST_MODIFIED,
- HttpDateFormat.toString(ftpFile.getTimestamp()));
- this.content = os.toByteArray();
- if (ftpFile.getTimestamp().getTimeInMillis() <= lastModified) {
- code = 304;
- return;
- }
-
- // // approximate bytes sent and read
- // if (this.httpAccounting != null) {
- // this.httpAccounting.incrementBytesSent(path.length());
- // this.httpAccounting.incrementBytesRead(this.content.length);
- // }
-
- this.code = 200; // http OK
-
- } catch (FtpExceptionCanNotHaveDataConnection e) {
-
- if (FTPReply.isPositiveCompletion(ftp.client.cwd(path))) {
- // it is not a file, but dir, so redirect as a dir
- this.headers.set(Response.LOCATION, path + "/");
- this.code = 300; // http redirect
- // fixme, should we do ftp.client.cwd("/"), back to top dir?
- } else {
- // it is not a dir either
- this.code = 404; // http Not Found
- }
-
- } catch (FtpExceptionUnknownForcedDataClose e) {
- // Please note control channel is still live.
- // in a way, this is our request fault
- if (Ftp.LOG.isWarnEnabled()) {
- Ftp.LOG.warn("Unrecognized reply after forced close of data channel. "
- + "If this is acceptable, please modify Client.java accordingly. "
- + e);
- }
- this.code = 400; // http Bad Request
- }
-
- }
-
- // get ftp dir list as http response
- private void getDirAsHttpResponse(String path, long lastModified)
- throws IOException {
- List<FTPFile> list = new LinkedList<FTPFile>();
-
- try {
-
- // change to that dir first
- if (!FTPReply.isPositiveCompletion(ftp.client.cwd(path))) {
- this.code = 404; // http Not Found
- return;
- }
-
- // fixme, should we do ftp.client.cwd("/"), back to top dir?
-
- ftp.client.retrieveList(null, list, ftp.maxContentLength, ftp.parser);
- this.content = list2html(list, path, "/".equals(path) ? false : true);
- this.headers.set(Response.CONTENT_LENGTH,
- new Integer(this.content.length).toString());
- this.headers.set(Response.CONTENT_TYPE, "text/html");
- // this.headers.put("Last-Modified", null);
-
- // // approximate bytes sent and read
- // if (this.httpAccounting != null) {
- // this.httpAccounting.incrementBytesSent(path.length());
- // this.httpAccounting.incrementBytesRead(this.content.length);
- // }
-
- this.code = 200; // http OK
-
- } catch (FtpExceptionControlClosedByForcedDataClose e) {
-
- // control connection is off, clean up
- // ftp.client.disconnect();
- if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
- Ftp.LOG.info("delete client because server cut off control channel: "
- + e);
- }
- ftp.client = null;
-
- this.content = list2html(list, path, "/".equals(path) ? false : true);
- this.headers.set(Response.CONTENT_LENGTH,
- new Integer(this.content.length).toString());
- this.headers.set(Response.CONTENT_TYPE, "text/html");
- // this.headers.put("Last-Modified", null);
-
- // // approximate bytes sent and read
- // if (this.httpAccounting != null) {
- // this.httpAccounting.incrementBytesSent(path.length());
- // this.httpAccounting.incrementBytesRead(this.content.length);
- // }
-
- this.code = 200; // http OK
-
- } catch (FtpExceptionUnknownForcedDataClose e) {
- // Please note control channel is still live.
- // in a way, this is our request fault
- if (Ftp.LOG.isWarnEnabled()) {
- Ftp.LOG.warn("Unrecognized reply after forced close of data channel. "
- + "If this is acceptable, please modify Client.java accordingly. "
- + e);
- }
- this.code = 400; // http Bad Request
- } catch (FtpExceptionCanNotHaveDataConnection e) {
- if (Ftp.LOG.isWarnEnabled()) {
- Ftp.LOG.warn("" + e);
- }
- this.code = 500; // http Iternal Server Error
- }
-
- }
-
- // generate html page from ftp dir list
- private byte[] list2html(List<FTPFile> list, String path,
- boolean includeDotDot) {
-
- // StringBuffer x = new
- // StringBuffer("<!doctype html public \"-//ietf//dtd html//en\"><html><head>");
- StringBuffer x = new StringBuffer("<html><head>");
- x.append("<title>Index of " + path + "</title></head>\n");
- x.append("<body><h1>Index of " + path + "</h1><pre>\n");
-
- if (includeDotDot) {
- x.append("<a href='../'>../</a>\t-\t-\t-\n");
- }
-
- for (int i = 0; i < list.size(); i++) {
- FTPFile f = (FTPFile) list.get(i);
- String name = f.getName();
- String time = HttpDateFormat.toString(f.getTimestamp());
- if (f.isDirectory()) {
- // some ftp server LIST "." and "..", we skip them here
- if (name.equals(".") || name.equals(".."))
- continue;
- x.append("<a href='" + name + "/" + "'>" + name + "/</a>\t");
- x.append(time + "\t-\n");
- } else if (f.isFile()) {
- x.append("<a href='" + name + "'>" + name + "</a>\t");
- x.append(time + "\t" + f.getSize() + "\n");
- } else {
- // ignore isSymbolicLink()
- // ignore isUnknown()
- }
- }
-
- x.append("</pre></body></html>\n");
-
- return new String(x).getBytes();
- }
-
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java
deleted file mode 100644
index 3764864..0000000
--- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java
+++ /dev/null
@@ -1,121 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.protocol.ftp;
-
-import java.net.URL;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.protocol.Protocol;
-import org.apache.nutch.protocol.ProtocolOutput;
-import org.apache.nutch.protocol.ProtocolStatus;
-import org.apache.nutch.protocol.RobotRulesParser;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import crawlercommons.robots.BaseRobotRules;
-import crawlercommons.robots.SimpleRobotRules;
-
-/**
- * This class is used for parsing robots for urls belonging to FTP protocol. It
- * extends the generic {@link RobotRulesParser} class and contains Ftp protocol
- * specific implementation for obtaining the robots file.
- */
-public class FtpRobotRulesParser extends RobotRulesParser {
-
- private static final String CONTENT_TYPE = "text/plain";
- public static final Logger LOG = LoggerFactory
- .getLogger(FtpRobotRulesParser.class);
-
- FtpRobotRulesParser() {
- }
-
- public FtpRobotRulesParser(Configuration conf) {
- super(conf);
- }
-
- /**
- * The hosts for which the caching of robots rules is yet to be done, it sends
- * a Ftp request to the host corresponding to the {@link URL} passed, gets
- * robots file, parses the rules and caches the rules object to avoid re-work
- * in future.
- *
- * @param ftp
- * The {@link Protocol} object
- * @param url
- * URL
- *
- * @return robotRules A {@link BaseRobotRules} object for the rules
- */
- public BaseRobotRules getRobotRulesSet(Protocol ftp, URL url) {
-
- String protocol = url.getProtocol().toLowerCase(); // normalize to lower
- // case
- String host = url.getHost().toLowerCase(); // normalize to lower case
-
- if (LOG.isTraceEnabled() && isWhiteListed(url)) {
- LOG.trace("Ignoring robots.txt (host is whitelisted) for URL: {}", url);
- }
-
- BaseRobotRules robotRules = CACHE.get(protocol + ":" + host);
-
- if (robotRules != null) {
- return robotRules; // cached rule
- } else if (LOG.isTraceEnabled()) {
- LOG.trace("cache miss " + url);
- }
-
- boolean cacheRule = true;
-
- if (isWhiteListed(url)) {
- // check in advance whether a host is whitelisted
- // (we do not need to fetch robots.txt)
- robotRules = EMPTY_RULES;
- LOG.info("Whitelisted host found for: {}", url);
- LOG.info("Ignoring robots.txt for all URLs from whitelisted host: {}", host);
-
- } else {
- try {
- Text robotsUrl = new Text(new URL(url, "/robots.txt").toString());
- ProtocolOutput output = ((Ftp) ftp).getProtocolOutput(robotsUrl,
- new CrawlDatum());
- ProtocolStatus status = output.getStatus();
-
- if (status.getCode() == ProtocolStatus.SUCCESS) {
- robotRules = parseRules(url.toString(), output.getContent()
- .getContent(), CONTENT_TYPE, agentNames);
- } else {
- robotRules = EMPTY_RULES; // use default rules
- }
- } catch (Throwable t) {
- if (LOG.isInfoEnabled()) {
- LOG.info("Couldn't get robots.txt for " + url + ": " + t.toString());
- }
- cacheRule = false; // try again later to fetch robots.txt
- robotRules = EMPTY_RULES;
- }
-
- }
-
- if (cacheRule)
- CACHE.put(protocol + ":" + host, robotRules); // cache rules for host
-
- return robotRules;
- }
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/PrintCommandListener.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/PrintCommandListener.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/PrintCommandListener.java
deleted file mode 100644
index c68eac8..0000000
--- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/PrintCommandListener.java
+++ /dev/null
@@ -1,71 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.protocol.ftp;
-
-import java.io.BufferedReader;
-import java.io.StringReader;
-import java.io.IOException;
-
-import org.slf4j.Logger;
-
-import org.apache.commons.net.ProtocolCommandEvent;
-import org.apache.commons.net.ProtocolCommandListener;
-
-/***
- * This is a support class for logging all ftp command/reply traffic.
- *
- * @author John Xing
- ***/
-public class PrintCommandListener implements ProtocolCommandListener {
- private Logger __logger;
-
- public PrintCommandListener(Logger logger) {
- __logger = logger;
- }
-
- public void protocolCommandSent(ProtocolCommandEvent event) {
- try {
- __logIt(event);
- } catch (IOException e) {
- if (__logger.isInfoEnabled()) {
- __logger.info("PrintCommandListener.protocolCommandSent(): " + e);
- }
- }
- }
-
- public void protocolReplyReceived(ProtocolCommandEvent event) {
- try {
- __logIt(event);
- } catch (IOException e) {
- if (__logger.isInfoEnabled()) {
- __logger.info("PrintCommandListener.protocolReplyReceived(): " + e);
- }
- }
- }
-
- private void __logIt(ProtocolCommandEvent event) throws IOException {
- if (!__logger.isInfoEnabled()) {
- return;
- }
- BufferedReader br = new BufferedReader(new StringReader(event.getMessage()));
- String line;
- while ((line = br.readLine()) != null) {
- __logger.info("ftp> " + line);
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/package.html
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/package.html b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/package.html
deleted file mode 100644
index d936930..0000000
--- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/package.html
+++ /dev/null
@@ -1,5 +0,0 @@
-<html>
-<body>
-<p>Protocol plugin which supports retrieving documents via the ftp protocol.</p><p></p>
-</body>
-</html>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-htmlunit/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-htmlunit/build.xml b/src/plugin/protocol-htmlunit/build.xml
deleted file mode 100644
index 899214c..0000000
--- a/src/plugin/protocol-htmlunit/build.xml
+++ /dev/null
@@ -1,37 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="protocol-htmlunit" default="jar-core">
-
- <import file="../build-plugin.xml"/>
-
- <!-- Build compilation dependencies -->
- <target name="deps-jar">
- <ant target="jar" inheritall="false" dir="../lib-http"/>
- <ant target="jar" inheritall="false" dir="../lib-htmlunit"/>
- </target>
-
- <!-- Add compilation dependencies to classpath -->
- <path id="plugin.deps">
- <fileset dir="${nutch.root}/build">
- <include name="**/lib-http/*.jar" />
- <include name="**/lib-htmlunit/*.jar" />
- </fileset>
- <pathelement location="${build.dir}/test/conf"/>
- </path>
-
-</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-htmlunit/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-htmlunit/ivy.xml b/src/plugin/protocol-htmlunit/ivy.xml
deleted file mode 100644
index 8aa78d2..0000000
--- a/src/plugin/protocol-htmlunit/ivy.xml
+++ /dev/null
@@ -1,38 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<ivy-module version="1.0">
- <info organisation="org.apache.nutch" module="${ant.project.name}">
- <license name="Apache 2.0"/>
- <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
- <description>
- Apache Nutch
- </description>
- </info>
-
- <configurations>
- <include file="../../..//ivy/ivy-configurations.xml"/>
- </configurations>
-
- <publications>
- <!--get the artifact from our module name-->
- <artifact conf="master"/>
- </publications>
-
-</ivy-module>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-htmlunit/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-htmlunit/plugin.xml b/src/plugin/protocol-htmlunit/plugin.xml
deleted file mode 100644
index 36bcb80..0000000
--- a/src/plugin/protocol-htmlunit/plugin.xml
+++ /dev/null
@@ -1,51 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<plugin
- id="protocol-htmlunit"
- name="HtmlUnit Protocol Plug-in"
- version="1.0.0"
- provider-name="nutch.apache.org">
-
- <runtime>
- <library name="protocol-htmlunit.jar">
- <export name="*"/>
- </library>
- </runtime>
-
- <requires>
- <import plugin="nutch-extensionpoints"/>
- <import plugin="lib-http"/>
- <import plugin="lib-htmlunit"/>
- </requires>
-
- <extension id="org.apache.nutch.protocol.http"
- name="HttpProtocol"
- point="org.apache.nutch.protocol.Protocol">
-
- <implementation id="org.apache.nutch.protocol.htmlunit.Http"
- class="org.apache.nutch.protocol.htmlunit.Http">
- <parameter name="protocolName" value="http"/>
- </implementation>
-
- <implementation id="org.apache.nutch.protocol.htmlunit.Http"
- class="org.apache.nutch.protocol.htmlunit.Http">
- <parameter name="protocolName" value="https"/>
- </implementation>
-
- </extension>
-</plugin>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/Http.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/Http.java b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/Http.java
deleted file mode 100644
index c40ed69..0000000
--- a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/Http.java
+++ /dev/null
@@ -1,63 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.protocol.htmlunit;
-
-import java.io.IOException;
-import java.net.URL;
-
-import org.apache.hadoop.conf.Configuration;
-
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.net.protocols.Response;
-import org.apache.nutch.protocol.ProtocolException;
-import org.apache.nutch.protocol.http.api.HttpBase;
-import org.apache.nutch.util.NutchConfiguration;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * Protocol plugin entry point for the HtmlUnit-backed HTTP/HTTPS fetcher.
- *
- * <p>All protocol handling is inherited from {@link HttpBase}; this class only
- * supplies the logger and creates the {@link HttpResponse} that performs the
- * actual fetch.
- */
-public class Http extends HttpBase {
-
-  /** Logger for this plugin; also used by {@link HttpResponse}. */
-  public static final Logger LOG = LoggerFactory.getLogger(Http.class);
-
-  /**
-   * Default constructor. Passes this plugin's logger to {@link HttpBase}.
-   */
-  public Http() {
-    super(LOG);
-  }
-
-  /**
-   * Set the {@link org.apache.hadoop.conf.Configuration} object.
-   *
-   * @param conf
-   *          configuration, forwarded unchanged to the superclass
-   */
-  @Override
-  public void setConf(Configuration conf) {
-    super.setConf(conf);
-  }
-
-  /**
-   * Command-line entry point: creates a plugin instance configured from
-   * {@link NutchConfiguration#create()} and delegates to the inherited
-   * {@code main(HttpBase, String[])}.
-   *
-   * @param args
-   *          command-line arguments, passed through unchanged
-   * @throws Exception
-   *           whatever the delegated {@code main} throws
-   */
-  public static void main(String[] args) throws Exception {
-    Http http = new Http();
-    http.setConf(NutchConfiguration.create());
-    main(http, args);
-  }
-
-  /**
-   * Fetches {@code url} by constructing a new {@link HttpResponse}.
-   *
-   * @param url
-   *          URL to fetch
-   * @param datum
-   *          crawl datum associated with the URL
-   * @param redirect
-   *          not used by this implementation
-   * @return the response produced for {@code url}
-   * @throws ProtocolException
-   *           if {@link HttpResponse} rejects the URL or the response
-   * @throws IOException
-   *           on I/O errors during the fetch
-   */
-  @Override
-  protected Response getResponse(URL url, CrawlDatum datum, boolean redirect)
-      throws ProtocolException, IOException {
-    return new HttpResponse(this, url, datum);
-  }
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
deleted file mode 100644
index 8b1a031..0000000
--- a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
+++ /dev/null
@@ -1,573 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- * <p/>
- * http://www.apache.org/licenses/LICENSE-2.0
- * <p/>
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.protocol.htmlunit;
-
-import java.io.BufferedInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.EOFException;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.io.PushbackInputStream;
-import java.net.InetSocketAddress;
-import java.net.Socket;
-import java.net.URL;
-import java.util.Arrays;
-import java.util.HashSet;
-import java.util.Set;
-
-import javax.net.ssl.SSLSocket;
-import javax.net.ssl.SSLSocketFactory;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.metadata.SpellCheckedMetadata;
-import org.apache.nutch.net.protocols.HttpDateFormat;
-import org.apache.nutch.net.protocols.Response;
-import org.apache.nutch.protocol.ProtocolException;
-import org.apache.nutch.protocol.http.api.HttpBase;
-import org.apache.nutch.protocol.http.api.HttpException;
-
-/**
- * An HTTP response.
- */
-public class HttpResponse implements Response {
-
- // Configuration obtained from the plugin via http.getConf() in the constructor.
- private Configuration conf;
- // Plugin instance that created this response; source of timeouts, proxy,
- // TLS preferences, request header values and the content size limit.
- private HttpBase http;
- // URL this response was fetched for.
- private URL url;
- // Both set to url.toString() in the constructor; not read anywhere in this class.
- private String orig;
- private String base;
- // Response body; assigned by the read* methods.
- private byte[] content;
- // Numeric HTTP status code parsed from the status line.
- private int code;
- // Parsed response headers plus bookkeeping entries added by the constructor.
- private Metadata headers = new SpellCheckedMetadata();
- // used for storing the http headers verbatim
- // (stays null unless "store.http.headers" is enabled)
- private StringBuffer httpHeaders;
-
- // URL schemes the constructor accepts.
- protected enum Scheme {
- HTTP, HTTPS,
- }
-
- /**
- * Default public constructor.
- *
- * @param http
- * @param url
- * @param datum
- * @throws ProtocolException
- * @throws IOException
- */
- public HttpResponse(HttpBase http, URL url, CrawlDatum datum)
- throws ProtocolException, IOException {
-
- this.http = http;
- this.url = url;
- this.orig = url.toString();
- this.base = url.toString();
-
- Scheme scheme = null;
-
- if ("http".equals(url.getProtocol())) {
- scheme = Scheme.HTTP;
- } else if ("https".equals(url.getProtocol())) {
- scheme = Scheme.HTTPS;
- } else {
- throw new HttpException("Unknown scheme (not http/https) for url:" + url);
- }
-
- if (Http.LOG.isTraceEnabled()) {
- Http.LOG.trace("fetching " + url);
- }
-
- String path = "".equals(url.getFile()) ? "/" : url.getFile();
-
- // some servers will redirect a request with a host line like
- // "Host: <hostname>:80" to "http://<hpstname>/<orig_path>"- they
- // don't want the :80...
-
- String host = url.getHost();
- int port;
- String portString;
- if (url.getPort() == -1) {
- if (scheme == Scheme.HTTP) {
- port = 80;
- } else {
- port = 443;
- }
- portString = "";
- } else {
- port = url.getPort();
- portString = ":" + port;
- }
- Socket socket = null;
-
- try {
- socket = new Socket(); // create the socket
- socket.setSoTimeout(http.getTimeout());
-
- // connect
- String sockHost = http.useProxy(url) ? http.getProxyHost() : host;
- int sockPort = http.useProxy(url) ? http.getProxyPort() : port;
- InetSocketAddress sockAddr = new InetSocketAddress(sockHost, sockPort);
- socket.connect(sockAddr, http.getTimeout());
-
- if (scheme == Scheme.HTTPS) {
- SSLSocketFactory factory = (SSLSocketFactory) SSLSocketFactory
- .getDefault();
- SSLSocket sslsocket = (SSLSocket) factory
- .createSocket(socket, sockHost, sockPort, true);
- sslsocket.setUseClientMode(true);
-
- // Get the protocols and ciphers supported by this JVM
- Set<String> protocols = new HashSet<String>(
- Arrays.asList(sslsocket.getSupportedProtocols()));
- Set<String> ciphers = new HashSet<String>(
- Arrays.asList(sslsocket.getSupportedCipherSuites()));
-
- // Intersect with preferred protocols and ciphers
- protocols.retainAll(http.getTlsPreferredProtocols());
- ciphers.retainAll(http.getTlsPreferredCipherSuites());
-
- sslsocket.setEnabledProtocols(
- protocols.toArray(new String[protocols.size()]));
- sslsocket.setEnabledCipherSuites(
- ciphers.toArray(new String[ciphers.size()]));
-
- sslsocket.startHandshake();
- socket = sslsocket;
- }
-
- this.conf = http.getConf();
- if (sockAddr != null
- && conf.getBoolean("store.ip.address", false) == true) {
- headers.add("_ip_", sockAddr.getAddress().getHostAddress());
- }
-
- // make request
- OutputStream req = socket.getOutputStream();
-
- StringBuffer reqStr = new StringBuffer("GET ");
- if (http.useProxy(url)) {
- reqStr.append(url.getProtocol() + "://" + host + portString + path);
- } else {
- reqStr.append(path);
- }
-
- reqStr.append(" HTTP/1.0\r\n");
-
- reqStr.append("Host: ");
- reqStr.append(host);
- reqStr.append(portString);
- reqStr.append("\r\n");
-
- reqStr.append("Accept-Encoding: x-gzip, gzip, deflate\r\n");
-
- String userAgent = http.getUserAgent();
- if ((userAgent == null) || (userAgent.length() == 0)) {
- if (Http.LOG.isErrorEnabled()) {
- Http.LOG.error("User-agent is not set!");
- }
- } else {
- reqStr.append("User-Agent: ");
- reqStr.append(userAgent);
- reqStr.append("\r\n");
- }
-
- reqStr.append("Accept-Language: ");
- reqStr.append(this.http.getAcceptLanguage());
- reqStr.append("\r\n");
-
- reqStr.append("Accept: ");
- reqStr.append(this.http.getAccept());
- reqStr.append("\r\n");
-
- if (http.isIfModifiedSinceEnabled() && datum.getModifiedTime() > 0) {
- reqStr.append("If-Modified-Since: " + HttpDateFormat
- .toString(datum.getModifiedTime()));
- reqStr.append("\r\n");
- }
- reqStr.append("\r\n");
-
- // store the request in the metadata?
- if (conf.getBoolean("store.http.request", false) == true) {
- headers.add("_request_", reqStr.toString());
- }
-
- byte[] reqBytes = reqStr.toString().getBytes();
-
- req.write(reqBytes);
- req.flush();
-
- PushbackInputStream in = // process response
- new PushbackInputStream(
- new BufferedInputStream(socket.getInputStream(),
- Http.BUFFER_SIZE), Http.BUFFER_SIZE);
-
- StringBuffer line = new StringBuffer();
-
- // store the http headers verbatim
- if (conf.getBoolean("store.http.headers", false) == true) {
- httpHeaders = new StringBuffer();
- }
-
- headers.add("nutch.fetch.time", Long.toString(System.currentTimeMillis()));
-
- boolean haveSeenNonContinueStatus = false;
- while (!haveSeenNonContinueStatus) {
- // parse status code line
- this.code = parseStatusLine(in, line);
- if (httpHeaders != null)
- httpHeaders.append(line).append("\n");
- // parse headers
- parseHeaders(in, line, httpHeaders);
- haveSeenNonContinueStatus = code != 100; // 100 is "Continue"
- }
-
- // Get Content type header
- String contentType = getHeader(Response.CONTENT_TYPE);
-
- // handle with HtmlUnit only if content type in HTML or XHTML
- if (contentType != null) {
- if (contentType.contains("text/html") || contentType.contains("application/xhtml")) {
- readContentFromHtmlUnit(url);
- } else {
- String transferEncoding = getHeader(Response.TRANSFER_ENCODING);
- if (transferEncoding != null && "chunked"
- .equalsIgnoreCase(transferEncoding.trim())) {
- readChunkedContent(in, line);
- } else {
- readPlainContent(in);
- }
-
- String contentEncoding = getHeader(Response.CONTENT_ENCODING);
- if ("gzip".equals(contentEncoding) || "x-gzip".equals(contentEncoding)) {
- content = http.processGzipEncoded(content, url);
- } else if ("deflate".equals(contentEncoding)) {
- content = http.processDeflateEncoded(content, url);
- } else {
- // store the headers verbatim only if the response was not compressed
- // as the content length reported with not match otherwise
- if (httpHeaders != null) {
- headers.add("_response.headers_", httpHeaders.toString());
- }
- if (Http.LOG.isTraceEnabled()) {
- Http.LOG.trace("fetched " + content.length + " bytes from " + url);
- }
- }
- }
- }
-
- } finally {
- if (socket != null)
- socket.close();
- }
-
- }
-
- /*
- * ------------------------- * <implementation:Response> *
- * -------------------------
- */
-
- /** Returns the URL that was passed to the constructor. */
- public URL getUrl() {
- return url;
- }
-
- /** Returns the status code parsed from the response status line. */
- public int getCode() {
- return code;
- }
-
- /**
- * Looks up a single entry in the header metadata.
- *
- * @param name header name to look up
- * @return whatever the underlying {@link Metadata#get} returns for {@code name}
- */
- public String getHeader(String name) {
- return headers.get(name);
- }
-
- /** Returns the header metadata object itself (not a copy). */
- public Metadata getHeaders() {
- return headers;
- }
-
- /** Returns the content array stored by the read* methods (not a copy). */
- public byte[] getContent() {
- return content;
- }
-
- /*
- * ------------------------- * </implementation:Response> *
- * -------------------------
- */
-
- private void readContentFromHtmlUnit(URL url) throws IOException {
- String page = HtmlUnitWebDriver.getHtmlPage(url.toString(), conf);
- content = page.getBytes("UTF-8");
- }
-
-  /**
-   * Reads a non-chunked response body from {@code in} into {@code content}.
-   *
-   * <p>At most {@code min(Content-Length, http.getMaxContent())} bytes are
-   * read; a missing, empty or negative Content-Length means "read until end
-   * of stream", still subject to the max-content limit (a negative limit
-   * disables it). Every read, including the first one, is bounded by the
-   * number of bytes still allowed, so the stored content never exceeds the
-   * limit.
-   *
-   * @param in
-   *          stream positioned at the first byte of the body
-   * @throws HttpException
-   *           if the Content-Length header is not a valid integer
-   * @throws IOException
-   *           on read errors
-   */
-  private void readPlainContent(InputStream in)
-      throws HttpException, IOException {
-
-    int contentLength = Integer.MAX_VALUE; // unknown length: read to EOF
-    String contentLengthString = headers.get(Response.CONTENT_LENGTH);
-    if (contentLengthString != null) {
-      contentLengthString = contentLengthString.trim();
-      try {
-        if (!contentLengthString.isEmpty())
-          contentLength = Integer.parseInt(contentLengthString);
-      } catch (NumberFormatException e) {
-        throw new HttpException("bad content length: " + contentLengthString,
-            e);
-      }
-      if (contentLength < 0) {
-        // a negative length is meaningless; treat it like a missing header
-        contentLength = Integer.MAX_VALUE;
-      }
-    }
-
-    // limit download size
-    int maxContent = http.getMaxContent();
-    if (maxContent >= 0 && contentLength > maxContent) {
-      contentLength = maxContent;
-    }
-
-    // do not try to read if the contentLength is 0
-    if (contentLength == 0) {
-      content = new byte[0];
-      return;
-    }
-
-    ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE);
-    byte[] bytes = new byte[Http.BUFFER_SIZE];
-    int length = 0;
-
-    // read content, never asking for more than the bytes still allowed;
-    // "contentLength - length" cannot overflow, unlike "length + BUFFER_SIZE"
-    while (length < contentLength) {
-      int toRead = Math.min(bytes.length, contentLength - length);
-      int i = in.read(bytes, 0, toRead);
-      if (i == -1) {
-        break; // server closed the connection
-      }
-      out.write(bytes, 0, i);
-      length += i;
-    }
-    content = out.toByteArray();
-  }
-
- /**
- * @param in
- * @param line
- * @throws HttpException
- * @throws IOException
- */
- private void readChunkedContent(PushbackInputStream in, StringBuffer line)
- throws HttpException, IOException {
- boolean doneChunks = false;
- int contentBytesRead = 0;
- byte[] bytes = new byte[Http.BUFFER_SIZE];
- ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE);
-
- while (!doneChunks) {
- if (Http.LOG.isTraceEnabled()) {
- Http.LOG.trace("Http: starting chunk");
- }
-
- readLine(in, line, false);
-
- String chunkLenStr;
- // if (LOG.isTraceEnabled()) { LOG.trace("chunk-header: '" + line + "'");
- // }
-
- int pos = line.indexOf(";");
- if (pos < 0) {
- chunkLenStr = line.toString();
- } else {
- chunkLenStr = line.substring(0, pos);
- // if (LOG.isTraceEnabled()) { LOG.trace("got chunk-ext: " +
- // line.substring(pos+1)); }
- }
- chunkLenStr = chunkLenStr.trim();
- int chunkLen;
- try {
- chunkLen = Integer.parseInt(chunkLenStr, 16);
- } catch (NumberFormatException e) {
- throw new HttpException("bad chunk length: " + line.toString());
- }
-
- if (chunkLen == 0) {
- doneChunks = true;
- break;
- }
-
- if (http.getMaxContent() >= 0 && (contentBytesRead + chunkLen) > http
- .getMaxContent())
- chunkLen = http.getMaxContent() - contentBytesRead;
-
- // read one chunk
- int chunkBytesRead = 0;
- while (chunkBytesRead < chunkLen) {
-
- int toRead = (chunkLen - chunkBytesRead) < Http.BUFFER_SIZE ?
- (chunkLen - chunkBytesRead) :
- Http.BUFFER_SIZE;
- int len = in.read(bytes, 0, toRead);
-
- if (len == -1)
- throw new HttpException("chunk eof after " + contentBytesRead
- + " bytes in successful chunks" + " and " + chunkBytesRead
- + " in current chunk");
-
- // DANGER!!! Will printed GZIPed stuff right to your
- // terminal!
- // if (LOG.isTraceEnabled()) { LOG.trace("read: " + new String(bytes, 0,
- // len)); }
-
- out.write(bytes, 0, len);
- chunkBytesRead += len;
- }
-
- readLine(in, line, false);
-
- }
-
- if (!doneChunks) {
- if (contentBytesRead != http.getMaxContent())
- throw new HttpException("chunk eof: !doneChunk && didn't max out");
- return;
- }
-
- content = out.toByteArray();
- parseHeaders(in, line, null);
-
- }
-
- private int parseStatusLine(PushbackInputStream in, StringBuffer line)
- throws IOException, HttpException {
- readLine(in, line, false);
-
- int codeStart = line.indexOf(" ");
- int codeEnd = line.indexOf(" ", codeStart + 1);
-
- // handle lines with no plaintext result code, ie:
- // "HTTP/1.1 200" vs "HTTP/1.1 200 OK"
- if (codeEnd == -1)
- codeEnd = line.length();
-
- int code;
- try {
- code = Integer.parseInt(line.substring(codeStart + 1, codeEnd));
- } catch (NumberFormatException e) {
- throw new HttpException(
- "bad status line '" + line + "': " + e.getMessage(), e);
- }
-
- return code;
- }
-
- private void processHeaderLine(StringBuffer line)
- throws IOException, HttpException {
-
- int colonIndex = line.indexOf(":"); // key is up to colon
- if (colonIndex == -1) {
- int i;
- for (i = 0; i < line.length(); i++)
- if (!Character.isWhitespace(line.charAt(i)))
- break;
- if (i == line.length())
- return;
- throw new HttpException("No colon in header:" + line);
- }
- String key = line.substring(0, colonIndex);
-
- int valueStart = colonIndex + 1; // skip whitespace
- while (valueStart < line.length()) {
- int c = line.charAt(valueStart);
- if (c != ' ' && c != '\t')
- break;
- valueStart++;
- }
- String value = line.substring(valueStart);
- headers.set(key, value);
- }
-
-  /**
-   * Reads header lines from {@code in} until an empty line and adds each one
-   * to the header metadata.
-   *
-   * <p>Some servers omit the blank line between headers and body. When a
-   * line contains {@code <!DOCTYPE}, {@code <HTML} or {@code <html}, the text
-   * from that marker onwards is pushed back onto the stream, the part before
-   * it is processed as a final header (failures are only logged) and parsing
-   * stops.
-   *
-   * @param in
-   *          stream positioned at the first header line
-   * @param line
-   *          scratch buffer reused for every line
-   * @param httpHeaders
-   *          if non-null, receives every raw line followed by "\n"
-   * @throws IOException
-   *           on read errors
-   * @throws HttpException
-   *           if a regular header line is malformed
-   */
-  private void parseHeaders(PushbackInputStream in, StringBuffer line,
-      StringBuffer httpHeaders) throws IOException, HttpException {
-
-    while (readLine(in, line, true) != 0) {
-
-      if (httpHeaders != null) {
-        httpHeaders.append(line).append("\n");
-      }
-
-      // handle HTTP responses with missing blank line after headers:
-      // look for the first marker that matches, in this order
-      int markupStart = line.indexOf("<!DOCTYPE");
-      if (markupStart == -1) {
-        markupStart = line.indexOf("<HTML");
-      }
-      if (markupStart == -1) {
-        markupStart = line.indexOf("<html");
-      }
-
-      if (markupStart == -1) {
-        // ordinary header line
-        processHeaderLine(line);
-        continue;
-      }
-
-      // give the start of the body back to the stream and keep only the
-      // header part in the buffer
-      in.unread(line.substring(markupStart).getBytes("UTF-8"));
-      line.setLength(markupStart);
-
-      try {
-        // TODO: (CM) We don't know the header names here
-        // since we're just handling them generically. It would
-        // be nice to provide some sort of mapping function here
-        // for the returned header names to the standard metadata
-        // names in the ParseData class
-        processHeaderLine(line);
-      } catch (Exception e) {
-        // fixme:
-        Http.LOG.warn("Error: ", e);
-      }
-      return;
-    }
-  }
-
- /**
- * Reads one line from {@code in} into {@code line} (which is cleared first)
- * and returns its length; the line terminator is not stored.
- *
- * A line ends at "\n", "\r\n" or a lone "\r". When
- * {@code allowContinuedLine} is true and a non-empty line is followed by a
- * space or tab, that character is consumed and reading continues into the
- * same buffer. Each byte read is appended as a single char.
- *
- * @param in stream to read from
- * @param line buffer that receives the line
- * @param allowContinuedLine whether a following space/tab continues the line
- * @return the number of characters in {@code line}; 0 for an empty line
- * @throws IOException on read errors
- * @throws EOFException if the stream ends before a line terminator is seen
- */
- private static int readLine(PushbackInputStream in, StringBuffer line,
- boolean allowContinuedLine) throws IOException {
- line.setLength(0);
- for (int c = in.read(); c != -1; c = in.read()) {
- switch (c) {
- case '\r':
- // swallow the '\n' of a "\r\n" pair
- if (peek(in) == '\n') {
- in.read();
- }
- // no break: deliberately falls through to the end-of-line handling
- case '\n':
- if (line.length() > 0) {
- // at EOL -- check for continued line if the current
- // (possibly continued) line wasn't blank
- if (allowContinuedLine)
- switch (peek(in)) {
- case ' ':
- case '\t': // line is continued
- in.read();
- // resumes the enclosing for loop, i.e. keeps reading this line
- continue;
- }
- }
- return line.length(); // else complete
- default:
- line.append((char) c);
- }
- }
- // end of stream reached without a line terminator
- throw new EOFException();
- }
-
-  /**
-   * Returns the next byte of {@code in} without consuming it.
-   *
-   * @param in
-   *          stream to look ahead in
-   * @return the next byte (0-255), or -1 at end of stream
-   * @throws IOException
-   *           on read errors
-   */
-  private static int peek(PushbackInputStream in) throws IOException {
-    int value = in.read();
-    // only push back real data: unread(-1) would insert the byte 0xFF and
-    // make the stream report a bogus character instead of end of stream
-    if (value != -1) {
-      in.unread(value);
-    }
-    return value;
-  }
-
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html
deleted file mode 100644
index 4181951..0000000
--- a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html
+++ /dev/null
@@ -1,21 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-<html>
-<body>
-<p>Protocol plugin which supports retrieving documents via the http protocol.</p><p></p>
-</body>
-</html>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-http/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-http/build.xml b/src/plugin/protocol-http/build.xml
deleted file mode 100755
index 30720f1..0000000
--- a/src/plugin/protocol-http/build.xml
+++ /dev/null
@@ -1,50 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="protocol-http" default="jar-core">
-
- <import file="../build-plugin.xml"/>
-
- <!-- Build compilation dependencies -->
- <target name="deps-jar">
- <ant target="jar" inheritall="false" dir="../lib-http"/>
- </target>
-
- <!-- Add compilation dependencies to classpath -->
- <path id="plugin.deps">
- <fileset dir="${nutch.root}/build">
- <include name="**/lib-http/*.jar" />
- </fileset>
- <pathelement location="${build.dir}/test/conf"/>
- </path>
-
- <!-- Deploy Unit test dependencies -->
- <target name="deps-test">
- <ant target="deploy" inheritall="false" dir="../lib-http"/>
- <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
- <copy toDir="${build.test}">
- <fileset dir="${src.test}" excludes="**/*.java"/>
- </copy>
- </target>
-
- <!-- for junit test -->
- <mkdir dir="${build.test}/data" />
- <copy todir="${build.test}/data">
- <fileset dir="jsp"/>
- </copy>
-
-</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-http/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-http/ivy.xml b/src/plugin/protocol-http/ivy.xml
deleted file mode 100644
index 1a86d68..0000000
--- a/src/plugin/protocol-http/ivy.xml
+++ /dev/null
@@ -1,41 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<ivy-module version="1.0">
- <info organisation="org.apache.nutch" module="${ant.project.name}">
- <license name="Apache 2.0"/>
- <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
- <description>
- Apache Nutch
- </description>
- </info>
-
- <configurations>
- <include file="../../..//ivy/ivy-configurations.xml"/>
- </configurations>
-
- <publications>
- <!--get the artifact from our module name-->
- <artifact conf="master"/>
- </publications>
-
- <dependencies>
- </dependencies>
-
-</ivy-module>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-http/jsp/basic-http.jsp
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-http/jsp/basic-http.jsp b/src/plugin/protocol-http/jsp/basic-http.jsp
deleted file mode 100644
index bf1f8bd..0000000
--- a/src/plugin/protocol-http/jsp/basic-http.jsp
+++ /dev/null
@@ -1,44 +0,0 @@
-<%--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
---%><%--
- Example JSP Page to Test Protocol-Http Plugin
---%><%@ page language="java" import="java.util.*" pageEncoding="UTF-8"%><%
-String path = request.getContextPath();
-String basePath = request.getScheme()+"://"+request.getServerName()+":"+request.getServerPort()+path+"/";
-%>
-
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
-<html>
- <head>
- <base href="<%=basePath%>">
-
- <title>HelloWorld</title>
- <meta http-equiv="content-type" content="text/html;charset=utf-8" />
- <meta name="Language" content="en" />
- <meta http-equiv="pragma" content="no-cache">
- <meta http-equiv="cache-control" content="no-cache">
- <meta http-equiv="expires" content="0">
- <meta http-equiv="keywords" content="keyword1,keyword2,keyword3">
- <meta http-equiv="description" content="This is my page">
- <!--
- <link rel="stylesheet" type="text/css" href="styles.css">
- -->
- </head>
-
- <body>
- Hello World!!! <br>
- </body>
-</html>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-http/jsp/brokenpage.jsp
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-http/jsp/brokenpage.jsp b/src/plugin/protocol-http/jsp/brokenpage.jsp
deleted file mode 100644
index f3f7c4a..0000000
--- a/src/plugin/protocol-http/jsp/brokenpage.jsp
+++ /dev/null
@@ -1,47 +0,0 @@
-<%--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
---%><%--
- Example JSP Page to Test Protocol-Http Plugin
---%>
-
-@ page language="java" import="java.util.*" pageEncoding="UTF-8"
-
-String path = request.getContextPath();
-String basePath = request.getScheme()+"://"+request.getServerName()+":"+request.getServerPort()+path+"/";
-
-
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
-<html>
- <head>
- <base href="<%=basePath%>">
-
- <title>HelloWorld</title>
- <meta http-equiv="content-type" content="text/html;charset=utf-8" />
- <meta name="Language" content="en" />
- <meta http-equiv="pragma" content="no-cache">
- <meta http-equiv="cache-control" content="no-cache">
- <meta http-equiv="expires" content="0">
- <meta http-equiv="keywords" content="keyword1,keyword2,keyword3">
- <meta http-equiv="description" content="This is my page">
- <!--
- <link rel="stylesheet" type="text/css" href="styles.css">
- -->
- </head>
-
- <body>
- Hello World!!! <br>
- </body>
-</html>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-http/jsp/redirect301.jsp
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-http/jsp/redirect301.jsp b/src/plugin/protocol-http/jsp/redirect301.jsp
deleted file mode 100644
index 1100b89..0000000
--- a/src/plugin/protocol-http/jsp/redirect301.jsp
+++ /dev/null
@@ -1,49 +0,0 @@
-<%--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
---%><%--
- Example JSP Page to Test Protocol-Http Plugin
---%><%@ page language="java" import="java.util.*" pageEncoding="UTF-8"%><%
-String path = request.getContextPath();
-String basePath = request.getScheme()+"://"+request.getServerName()+":"+request.getServerPort()+path+"/";
-%>
-
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
-<html>
- <head>
- <base href="<%=basePath%>">
-
- <title>My JSP page</title>
-
- <meta http-equiv="pragma" content="no-cache">
- <meta http-equiv="cache-control" content="no-cache">
- <meta http-equiv="expires" content="0">
- <meta http-equiv="keywords" content="keyword1,keyword2,keyword3">
- <meta http-equiv="description" content="This is my page">
- <!--
- <link rel="stylesheet" type="text/css" href="styles.css">
- -->
-
- </head>
-
- <body>
- <%
- response.setStatus(301);
- response.setHeader( "Location", "http://nutch.apache.org");
- response.setHeader( "Connection", "close" );
- %>
- You are redirected by JSP<br>
- </body>
-</html>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-http/jsp/redirect302.jsp
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-http/jsp/redirect302.jsp b/src/plugin/protocol-http/jsp/redirect302.jsp
deleted file mode 100644
index 8a250d9..0000000
--- a/src/plugin/protocol-http/jsp/redirect302.jsp
+++ /dev/null
@@ -1,49 +0,0 @@
-<%--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
---%><%--
- Example JSP Page to Test Protocol-Http Plugin
---%><%@ page language="java" import="java.util.*" pageEncoding="UTF-8"%><%
-String path = request.getContextPath();
-String basePath = request.getScheme()+"://"+request.getServerName()+":"+request.getServerPort()+path+"/";
-%>
-
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
-<html>
- <head>
- <base href="<%=basePath%>">
-
- <title>My JSP page</title>
-
- <meta http-equiv="pragma" content="no-cache">
- <meta http-equiv="cache-control" content="no-cache">
- <meta http-equiv="expires" content="0">
- <meta http-equiv="keywords" content="keyword1,keyword2,keyword3">
- <meta http-equiv="description" content="This is my page">
- <!--
- <link rel="stylesheet" type="text/css" href="styles.css">
- -->
-
- </head>
-
- <body>
- <%
- response.setStatus(302);
- response.setHeader( "Location", "http://nutch.apache.org");
- response.setHeader( "Connection", "close" );
- %>
- You are sucessfully redirected by JSP<br>
- </body>
-</html>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-http/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-http/plugin.xml b/src/plugin/protocol-http/plugin.xml
deleted file mode 100755
index 8770b10..0000000
--- a/src/plugin/protocol-http/plugin.xml
+++ /dev/null
@@ -1,51 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<plugin
- id="protocol-http"
- name="Http Protocol Plug-in"
- version="1.0.0"
- provider-name="nutch.org">
- <!-- Runtime library of this plugin: protocol-http.jar, exported with the pattern "*". -->
- <runtime>
- <library name="protocol-http.jar">
- <export name="*"/>
- </library>
- </runtime>
- <!-- Other plugins imported by this one. -->
- <requires>
- <import plugin="nutch-extensionpoints"/>
- <import plugin="lib-http"/>
- </requires>
- <!-- Registers implementations on the org.apache.nutch.protocol.Protocol extension point. -->
- <extension id="org.apache.nutch.protocol.http"
- name="HttpProtocol"
- point="org.apache.nutch.protocol.Protocol">
- <!-- org.apache.nutch.protocol.http.Http registered with protocolName "http". -->
- <implementation id="org.apache.nutch.protocol.http.Http"
- class="org.apache.nutch.protocol.http.Http">
- <parameter name="protocolName" value="http"/>
- </implementation>
- <!-- The same class registered again with protocolName "https".
- NOTE(review): this element reuses the id of the implementation above;
- confirm that duplicate ids are acceptable to the plugin loader. -->
- <implementation id="org.apache.nutch.protocol.http.Http"
- class="org.apache.nutch.protocol.http.Http">
- <parameter name="protocolName" value="https"/>
- </implementation>
-
- </extension>
-
-</plugin>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java
deleted file mode 100755
index 56f9f4f..0000000
--- a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java
+++ /dev/null
@@ -1,73 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.protocol.http;
-
-// JDK imports
-import java.io.IOException;
-import java.net.URL;
-
-// SLF4J imports
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-// Hadoop imports
-import org.apache.hadoop.conf.Configuration;
-
-// Nutch imports
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.net.protocols.Response;
-import org.apache.nutch.protocol.ProtocolException;
-import org.apache.nutch.protocol.http.api.HttpBase;
-import org.apache.nutch.util.NutchConfiguration;
-/**
- * HTTP protocol plugin class: a thin {@link HttpBase} subclass whose only
- * added behavior is creating {@link HttpResponse} objects in
- * {@link #getResponse(URL, CrawlDatum, boolean)}.
- */
-public class Http extends HttpBase {
- /** Logger for this class; also handed to the superclass constructor. */
- public static final Logger LOG = LoggerFactory.getLogger(Http.class);
-
- /**
- * Public default constructor. Passes this class's {@link #LOG} to the
- * {@link HttpBase} constructor.
- */
- public Http() {
- super(LOG);
- }
-
- /**
- * Set the {@link org.apache.hadoop.conf.Configuration} object.
- * Currently this only delegates to the superclass. The commented-out
- * lines in the body are inactive; they would have chosen a log level
- * from the {@code http.verbose} setting.
- *
- * @param conf the configuration forwarded to {@code super.setConf}
- */
- public void setConf(Configuration conf) {
- super.setConf(conf);
- // Level logLevel = Level.WARNING;
- // if (conf.getBoolean("http.verbose", false)) {
- // logLevel = Level.FINE;
- // }
- // LOG.setLevel(logLevel);
- }
- /**
- * Command-line entry point. Creates an {@code Http} instance, configures
- * it with {@link NutchConfiguration#create()} and hands it, together with
- * the arguments, to the two-argument {@code main}. That method is not
- * defined in this class; presumably it is inherited from {@link HttpBase}
- * (TODO confirm).
- *
- * @param args command-line arguments, forwarded unchanged
- * @throws Exception declared by this method; what is actually thrown
- * depends on the delegate
- */
- public static void main(String[] args) throws Exception {
- Http http = new Http();
- http.setConf(NutchConfiguration.create());
- main(http, args);
- }
- /**
- * Creates the response for a URL by constructing a new
- * {@link HttpResponse}.
- *
- * @param url passed to the {@code HttpResponse} constructor
- * @param datum passed to the {@code HttpResponse} constructor
- * @param redirect not used by this implementation
- * @return a new {@code HttpResponse} built from this instance,
- * {@code url} and {@code datum}
- * @throws ProtocolException declared here; presumably raised by the
- * {@code HttpResponse} constructor (TODO confirm)
- * @throws IOException declared here; presumably raised by the
- * {@code HttpResponse} constructor (TODO confirm)
- */
- protected Response getResponse(URL url, CrawlDatum datum, boolean redirect)
- throws ProtocolException, IOException {
- return new HttpResponse(this, url, datum);
- }
-
-}