You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2012/05/21 18:40:33 UTC
svn commit: r1341100 - in /nutch/branches/nutchgora: ./ conf/
src/java/org/apache/nutch/metadata/
src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/
src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/
src/plugin/protocol-sft...
Author: lewismc
Date: Mon May 21 16:40:32 2012
New Revision: 1341100
URL: http://svn.apache.org/viewvc?rev=1341100&view=rev
Log:
commit to address NUTCH-1360 and update to CHANGES.txt
Modified:
nutch/branches/nutchgora/CHANGES.txt
nutch/branches/nutchgora/conf/nutch-default.xml
nutch/branches/nutchgora/src/java/org/apache/nutch/metadata/HttpHeaders.java
nutch/branches/nutchgora/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
nutch/branches/nutchgora/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
nutch/branches/nutchgora/src/plugin/protocol-sftp/src/java/org/apache/nutch/protocol/sftp/Sftp.java
Modified: nutch/branches/nutchgora/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/nutchgora/CHANGES.txt?rev=1341100&r1=1341099&r2=1341100&view=diff
==============================================================================
--- nutch/branches/nutchgora/CHANGES.txt (original)
+++ nutch/branches/nutchgora/CHANGES.txt Mon May 21 16:40:32 2012
@@ -1,6 +1,9 @@
Nutch Change Log
Release nutchgora - Current Development
+
+* NUTCH-1360 Support the storing of IP address connected to when web crawling (lewismc)
+
* NUTCH-1366 speed up indexing by eliminating the indexreducer (ferdy)
* NUTCH-1362 Fix error handling of urls with empty fields (lewis, ferdy)
Modified: nutch/branches/nutchgora/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/branches/nutchgora/conf/nutch-default.xml?rev=1341100&r1=1341099&r2=1341100&view=diff
==============================================================================
--- nutch/branches/nutchgora/conf/nutch-default.xml (original)
+++ nutch/branches/nutchgora/conf/nutch-default.xml Mon May 21 16:40:32 2012
@@ -257,6 +257,13 @@
</description>
</property>
+<property>
+ <name>http.store.ip.address</name>
+ <value>false</value>
+ <description>If true, store the IP address of the host we
+ connected to when fetching a page.</description>
+</property>
+
<!-- FTP properties -->
<property>
Modified: nutch/branches/nutchgora/src/java/org/apache/nutch/metadata/HttpHeaders.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchgora/src/java/org/apache/nutch/metadata/HttpHeaders.java?rev=1341100&r1=1341099&r2=1341100&view=diff
==============================================================================
--- nutch/branches/nutchgora/src/java/org/apache/nutch/metadata/HttpHeaders.java (original)
+++ nutch/branches/nutchgora/src/java/org/apache/nutch/metadata/HttpHeaders.java Mon May 21 16:40:32 2012
@@ -46,5 +46,7 @@ public interface HttpHeaders {
public final static String LAST_MODIFIED = "Last-Modified";
public final static String LOCATION = "Location";
+
+ public final static String IP_ADDRESS = "_ip";
}
Modified: nutch/branches/nutchgora/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchgora/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=1341100&r1=1341099&r2=1341100&view=diff
==============================================================================
--- nutch/branches/nutchgora/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java (original)
+++ nutch/branches/nutchgora/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Mon May 21 16:40:32 2012
@@ -68,8 +68,8 @@ public abstract class HttpBase implement
/** The Nutch 'User-Agent' request header */
protected String userAgent = getAgentString(
"NutchCVS", null, "Nutch",
- "http://lucene.apache.org/nutch/bot.html",
- "nutch-agent@lucene.apache.org");
+ "http://nutch.apache.org/bot.html",
+ "agent@nutch.apache.org");
/** The "Accept-Language" request header value. */
@@ -77,6 +77,9 @@ public abstract class HttpBase implement
/** The "Accept" request header value. */
protected String accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
+
+ /** Value of the "http.store.ip.address" configuration property (defaults to false). */
+ protected boolean ip_header = false;
/** The default logger */
private final static Logger LOGGER = LoggerFactory.getLogger(HttpBase.class);
@@ -117,6 +120,7 @@ public abstract class HttpBase implement
.get("http.agent.description"), conf.get("http.agent.url"), conf.get("http.agent.email"));
this.acceptLanguage = conf.get("http.accept.language", acceptLanguage);
this.accept = conf.get("http.accept", accept);
+ this.ip_header = conf.getBoolean("http.store.ip.address", false);
this.mimeTypes = new MimeUtil(conf);
this.useHttp11 = conf.getBoolean("http.useHttp11", false);
this.robots.setConf(conf);
@@ -246,6 +250,10 @@ public abstract class HttpBase implement
public boolean getUseHttp11() {
return useHttp11;
}
+
+ public boolean getIP_Header(){
+ return ip_header;
+ }
private static String getAgentString(String agentName,
String agentVersion,
@@ -301,6 +309,7 @@ public abstract class HttpBase implement
logger.info("http.agent = " + userAgent);
logger.info("http.accept.language = " + acceptLanguage);
logger.info("http.accept = " + accept);
+ logger.info("http.store.ip.address = " + ip_header);
}
}
Modified: nutch/branches/nutchgora/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchgora/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java?rev=1341100&r1=1341099&r2=1341100&view=diff
==============================================================================
--- nutch/branches/nutchgora/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java (original)
+++ nutch/branches/nutchgora/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java Mon May 21 16:40:32 2012
@@ -89,7 +89,9 @@ public class HttpResponse implements Res
int sockPort = http.useProxy() ? http.getProxyPort() : port;
InetSocketAddress sockAddr= new InetSocketAddress(sockHost, sockPort);
socket.connect(sockAddr, http.getTimeout());
-
+
+ headers.set("_ip", socket.getInetAddress().getHostAddress());
+
// make request
OutputStream req = socket.getOutputStream();
@@ -99,14 +101,20 @@ public class HttpResponse implements Res
} else {
reqStr.append(path);
}
-
+
reqStr.append(" HTTP/1.0\r\n");
reqStr.append("Host: ");
reqStr.append(host);
reqStr.append(portString);
reqStr.append("\r\n");
-
+
+ if(this.http.getConf().getBoolean("http.store.ip.address", true)) {
+ reqStr.append("_ip: ");
+ reqStr.append(http.getIP_Header());
+ reqStr.append("\r\n");
+ }
+
reqStr.append("Accept-Encoding: x-gzip, gzip\r\n");
reqStr.append("Accept: ");
Modified: nutch/branches/nutchgora/src/plugin/protocol-sftp/src/java/org/apache/nutch/protocol/sftp/Sftp.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchgora/src/plugin/protocol-sftp/src/java/org/apache/nutch/protocol/sftp/Sftp.java?rev=1341100&r1=1341099&r2=1341100&view=diff
==============================================================================
--- nutch/branches/nutchgora/src/plugin/protocol-sftp/src/java/org/apache/nutch/protocol/sftp/Sftp.java (original)
+++ nutch/branches/nutchgora/src/plugin/protocol-sftp/src/java/org/apache/nutch/protocol/sftp/Sftp.java Mon May 21 16:40:32 2012
@@ -32,7 +32,6 @@ import java.util.concurrent.BlockingQueu
//APACHE imports
import org.apache.hadoop.conf.Configuration;
-import org.apache.log4j.Logger;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.net.protocols.Response;
import org.apache.nutch.protocol.Content;
@@ -42,6 +41,10 @@ import org.apache.nutch.protocol.RobotRu
import org.apache.nutch.storage.WebPage;
import org.apache.nutch.storage.WebPage.Field;
+//Logging imports
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
//JSCH imports
import com.jcraft.jsch.ChannelSftp;
import com.jcraft.jsch.JSch;
@@ -56,7 +59,7 @@ import com.jcraft.jsch.ChannelSftp.LsEnt
*/
public class Sftp implements Protocol {
- private static final Logger logger = Logger.getLogger(Sftp.class);
+ private static final Logger LOG = LoggerFactory.getLogger(Sftp.class);
private static final Map<String, BlockingQueue<ChannelSftp>> channelSftpByHostMap = new Hashtable<String, BlockingQueue<ChannelSftp>>();
private Configuration configuration;
@@ -92,7 +95,7 @@ public class Sftp implements Protocol {
return po;
}
} catch (MalformedURLException e) {
- logger.error("Bad URL String: " + urlStr, e);
+ LOG.error("Bad URL String: " + urlStr, e);
return null;
} catch (InterruptedException e) {
return null;
@@ -101,14 +104,14 @@ public class Sftp implements Protocol {
} catch (IOException e) {
return null;
} catch (Exception e) {
- logger.error("Unknown Exception in getProtocolOutput()", e);
+ LOG.error("Unknown Exception in getProtocolOutput()", e);
return null;
} finally {
if (channelSftp != null) {
try {
putChannelSftp(sUrl, channelSftp);
} catch (InterruptedException e) {
- logger.error("Cannot return ChannelSftp object to Queue", e);
+ LOG.error("Cannot return ChannelSftp object to Queue", e);
}
}
}
@@ -125,8 +128,7 @@ public class Sftp implements Protocol {
ChannelSftp cSftp = queue.take();
return cSftp;
} catch (InterruptedException e) {
- logger
- .error("Wait for getChannelSftp() interrupted for host: " + host, e);
+ LOG.error("Wait for getChannelSftp() interrupted for host: " + host, e);
throw e;
}
}
@@ -142,8 +144,7 @@ public class Sftp implements Protocol {
try {
queue.put(cSftp);
} catch (InterruptedException e) {
- logger
- .error("Wait for putChannelSftp() interrupted for host: " + host, e);
+ LOG.error("Wait for putChannelSftp() interrupted for host: " + host, e);
throw e;
}
}
@@ -159,11 +160,11 @@ public class Sftp implements Protocol {
bytes = new byte[size];
iStream.read(bytes);
} catch (SftpException e) {
- logger.error("SftpException in getFileProtocolOutput(), file: "
+ LOG.error("SftpException in getFileProtocolOutput(), file: "
+ url.getFile(), e);
throw e;
} catch (IOException e) {
- logger.error("IOException in getFileProtocolOutput(), file: "
+ LOG.error("IOException in getFileProtocolOutput(), file: "
+ url.getFile(), e);
throw e;
} finally {
@@ -219,7 +220,7 @@ public class Sftp implements Protocol {
ProtocolOutput po = new ProtocolOutput(content);
return po;
} catch (SftpException e) {
- logger.error("SftpException in getDirectoryProtocolOutput()", e);
+ LOG.error("SftpException in getDirectoryProtocolOutput()", e);
throw e;
}
}
@@ -254,7 +255,7 @@ public class Sftp implements Protocol {
try {
session = jsch.getSession(user, server, port);
} catch (JSchException e) {
- logger.error("Cannot create JSch session for user: " + user
+ LOG.error("Cannot create JSch session for user: " + user
+ ", host: " + server + ", port: " + port);
return;
}
@@ -271,7 +272,7 @@ public class Sftp implements Protocol {
cSftp = (ChannelSftp) session.openChannel("sftp");
cSftp.connect();
} catch (JSchException e) {
- logger.error("Cannot connect to JSch session for user: " + user
+ LOG.error("Cannot connect to JSch session for user: " + user
+ ", host: " + server + ", port: " + port);
return;
}
@@ -281,7 +282,7 @@ public class Sftp implements Protocol {
try {
queue.put(cSftp);
} catch (InterruptedException e) {
- logger.error("Interrupted during setConf()", e);
+ LOG.error("Interrupted during setConf()", e);
return;
}
channelSftpByHostMap.put(server, queue);