You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2012/05/21 18:40:33 UTC

svn commit: r1341100 - in /nutch/branches/nutchgora: ./ conf/ src/java/org/apache/nutch/metadata/ src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/ src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/ src/plugin/protocol-sft...

Author: lewismc
Date: Mon May 21 16:40:32 2012
New Revision: 1341100

URL: http://svn.apache.org/viewvc?rev=1341100&view=rev
Log:
Commit to address NUTCH-1360 and update CHANGES.txt

Modified:
    nutch/branches/nutchgora/CHANGES.txt
    nutch/branches/nutchgora/conf/nutch-default.xml
    nutch/branches/nutchgora/src/java/org/apache/nutch/metadata/HttpHeaders.java
    nutch/branches/nutchgora/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
    nutch/branches/nutchgora/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
    nutch/branches/nutchgora/src/plugin/protocol-sftp/src/java/org/apache/nutch/protocol/sftp/Sftp.java

Modified: nutch/branches/nutchgora/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/nutchgora/CHANGES.txt?rev=1341100&r1=1341099&r2=1341100&view=diff
==============================================================================
--- nutch/branches/nutchgora/CHANGES.txt (original)
+++ nutch/branches/nutchgora/CHANGES.txt Mon May 21 16:40:32 2012
@@ -1,6 +1,9 @@
 Nutch Change Log
 
 Release nutchgora - Current Development
+
+* NUTCH-1360 Support the storing of IP address connected to when web crawling (lewismc)
+
 * NUTCH-1366 speed up indexing by eliminating the indexreducer (ferdy)
 
 * NUTCH-1362 Fix error handling of urls with empty fields (lewis, ferdy)

Modified: nutch/branches/nutchgora/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/branches/nutchgora/conf/nutch-default.xml?rev=1341100&r1=1341099&r2=1341100&view=diff
==============================================================================
--- nutch/branches/nutchgora/conf/nutch-default.xml (original)
+++ nutch/branches/nutchgora/conf/nutch-default.xml Mon May 21 16:40:32 2012
@@ -257,6 +257,13 @@
   </description>
 </property>
 
+<property>
+  <name>http.store.ip.address</name>
+  <value>false</value>
+  <description>If true, captures the IP address of the host we
+  connect to when fetching a page.</description>
+</property>
+
 <!-- FTP properties -->
 
 <property>

Modified: nutch/branches/nutchgora/src/java/org/apache/nutch/metadata/HttpHeaders.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchgora/src/java/org/apache/nutch/metadata/HttpHeaders.java?rev=1341100&r1=1341099&r2=1341100&view=diff
==============================================================================
--- nutch/branches/nutchgora/src/java/org/apache/nutch/metadata/HttpHeaders.java (original)
+++ nutch/branches/nutchgora/src/java/org/apache/nutch/metadata/HttpHeaders.java Mon May 21 16:40:32 2012
@@ -46,5 +46,7 @@ public interface HttpHeaders {
   public final static String LAST_MODIFIED = "Last-Modified";
 
   public final static String LOCATION = "Location";
+  
+  public final static String IP_ADDRESS = "_ip";
 
 }

Modified: nutch/branches/nutchgora/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchgora/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=1341100&r1=1341099&r2=1341100&view=diff
==============================================================================
--- nutch/branches/nutchgora/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java (original)
+++ nutch/branches/nutchgora/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Mon May 21 16:40:32 2012
@@ -68,8 +68,8 @@ public abstract class HttpBase implement
   /** The Nutch 'User-Agent' request header */
   protected String userAgent = getAgentString(
       "NutchCVS", null, "Nutch",
-      "http://lucene.apache.org/nutch/bot.html",
-  "nutch-agent@lucene.apache.org");
+      "http://nutch.apache.org/bot.html",
+  "agent@nutch.apache.org");
 
 
   /** The "Accept-Language" request header value. */
@@ -77,6 +77,9 @@ public abstract class HttpBase implement
   
   /** The "Accept" request header value. */
   protected String accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
+  
+  /** Whether to store the IP address of the connected host ("http.store.ip.address"). */
+  protected boolean ip_header = false;
 
   /** The default logger */
   private final static Logger LOGGER = LoggerFactory.getLogger(HttpBase.class);
@@ -117,6 +120,7 @@ public abstract class HttpBase implement
         .get("http.agent.description"), conf.get("http.agent.url"), conf.get("http.agent.email"));
     this.acceptLanguage = conf.get("http.accept.language", acceptLanguage);
     this.accept = conf.get("http.accept", accept);
+    this.ip_header = conf.getBoolean("http.store.ip.address", false);
     this.mimeTypes = new MimeUtil(conf);
     this.useHttp11 = conf.getBoolean("http.useHttp11", false);
     this.robots.setConf(conf);
@@ -246,6 +250,10 @@ public abstract class HttpBase implement
   public boolean getUseHttp11() {
     return useHttp11;
   }
+  
+  public boolean getIP_Header(){
+	return ip_header;
+  }
 
   private static String getAgentString(String agentName,
       String agentVersion,
@@ -301,6 +309,7 @@ public abstract class HttpBase implement
       logger.info("http.agent = " + userAgent);
       logger.info("http.accept.language = " + acceptLanguage);
       logger.info("http.accept = " + accept);
+      logger.info("http.store.ip.address = " + ip_header);
     }
   }
 

Modified: nutch/branches/nutchgora/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchgora/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java?rev=1341100&r1=1341099&r2=1341100&view=diff
==============================================================================
--- nutch/branches/nutchgora/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java (original)
+++ nutch/branches/nutchgora/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java Mon May 21 16:40:32 2012
@@ -89,7 +89,9 @@ public class HttpResponse implements Res
       int sockPort = http.useProxy() ? http.getProxyPort() : port;
       InetSocketAddress sockAddr= new InetSocketAddress(sockHost, sockPort);
       socket.connect(sockAddr, http.getTimeout());
-
+      
+      headers.set("_ip", socket.getInetAddress().getHostAddress());
+      
       // make request
       OutputStream req = socket.getOutputStream();
 
@@ -99,14 +101,20 @@ public class HttpResponse implements Res
       } else {
       	reqStr.append(path);
       }
-
+      
       reqStr.append(" HTTP/1.0\r\n");
 
       reqStr.append("Host: ");
       reqStr.append(host);
       reqStr.append(portString);
       reqStr.append("\r\n");
-
+      
+      if(this.http.getConf().getBoolean("http.store.ip.address", true)) {
+        reqStr.append("_ip: ");
+    	reqStr.append(http.getIP_Header());
+    	reqStr.append("\r\n");
+      }
+      
       reqStr.append("Accept-Encoding: x-gzip, gzip\r\n");
 
       reqStr.append("Accept: ");

Modified: nutch/branches/nutchgora/src/plugin/protocol-sftp/src/java/org/apache/nutch/protocol/sftp/Sftp.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchgora/src/plugin/protocol-sftp/src/java/org/apache/nutch/protocol/sftp/Sftp.java?rev=1341100&r1=1341099&r2=1341100&view=diff
==============================================================================
--- nutch/branches/nutchgora/src/plugin/protocol-sftp/src/java/org/apache/nutch/protocol/sftp/Sftp.java (original)
+++ nutch/branches/nutchgora/src/plugin/protocol-sftp/src/java/org/apache/nutch/protocol/sftp/Sftp.java Mon May 21 16:40:32 2012
@@ -32,7 +32,6 @@ import java.util.concurrent.BlockingQueu
 
 //APACHE imports
 import org.apache.hadoop.conf.Configuration;
-import org.apache.log4j.Logger;
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.net.protocols.Response;
 import org.apache.nutch.protocol.Content;
@@ -42,6 +41,10 @@ import org.apache.nutch.protocol.RobotRu
 import org.apache.nutch.storage.WebPage;
 import org.apache.nutch.storage.WebPage.Field;
 
+//Logging imports
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
 //JSCH imports
 import com.jcraft.jsch.ChannelSftp;
 import com.jcraft.jsch.JSch;
@@ -56,7 +59,7 @@ import com.jcraft.jsch.ChannelSftp.LsEnt
  */
 public class Sftp implements Protocol {
 
-  private static final Logger logger = Logger.getLogger(Sftp.class);
+  private static final Logger LOG = LoggerFactory.getLogger(Sftp.class);
   private static final Map<String, BlockingQueue<ChannelSftp>> channelSftpByHostMap = new Hashtable<String, BlockingQueue<ChannelSftp>>();
 
   private Configuration configuration;
@@ -92,7 +95,7 @@ public class Sftp implements Protocol {
         return po;
       }
     } catch (MalformedURLException e) {
-      logger.error("Bad URL String: " + urlStr, e);
+      LOG.error("Bad URL String: " + urlStr, e);
       return null;
     } catch (InterruptedException e) {
       return null;
@@ -101,14 +104,14 @@ public class Sftp implements Protocol {
     } catch (IOException e) {
       return null;
     } catch (Exception e) {
-      logger.error("Unknown Exception in getProtocolOutput()", e);
+      LOG.error("Unknown Exception in getProtocolOutput()", e);
       return null;
     } finally {
       if (channelSftp != null) {
         try {
           putChannelSftp(sUrl, channelSftp);
         } catch (InterruptedException e) {
-          logger.error("Cannot return ChannelSftp object to Queue", e);
+          LOG.error("Cannot return ChannelSftp object to Queue", e);
         }
       }
     }
@@ -125,8 +128,7 @@ public class Sftp implements Protocol {
       ChannelSftp cSftp = queue.take();
       return cSftp;
     } catch (InterruptedException e) {
-      logger
-          .error("Wait for getChannelSftp() interrupted for host: " + host, e);
+      LOG.error("Wait for getChannelSftp() interrupted for host: " + host, e);
       throw e;
     }
   }
@@ -142,8 +144,7 @@ public class Sftp implements Protocol {
     try {
       queue.put(cSftp);
     } catch (InterruptedException e) {
-      logger
-          .error("Wait for putChannelSftp() interrupted for host: " + host, e);
+      LOG.error("Wait for putChannelSftp() interrupted for host: " + host, e);
       throw e;
     }
   }
@@ -159,11 +160,11 @@ public class Sftp implements Protocol {
       bytes = new byte[size];
       iStream.read(bytes);
     } catch (SftpException e) {
-      logger.error("SftpException in getFileProtocolOutput(), file: "
+      LOG.error("SftpException in getFileProtocolOutput(), file: "
           + url.getFile(), e);
       throw e;
     } catch (IOException e) {
-      logger.error("IOException in getFileProtocolOutput(), file: "
+      LOG.error("IOException in getFileProtocolOutput(), file: "
           + url.getFile(), e);
       throw e;
     } finally {
@@ -219,7 +220,7 @@ public class Sftp implements Protocol {
       ProtocolOutput po = new ProtocolOutput(content);
       return po;
     } catch (SftpException e) {
-      logger.error("SftpException in getDirectoryProtocolOutput()", e);
+      LOG.error("SftpException in getDirectoryProtocolOutput()", e);
       throw e;
     }
   }
@@ -254,7 +255,7 @@ public class Sftp implements Protocol {
       try {
         session = jsch.getSession(user, server, port);
       } catch (JSchException e) {
-        logger.error("Cannot create JSch session for user: " + user
+        LOG.error("Cannot create JSch session for user: " + user
             + ", host: " + server + ", port: " + port);
         return;
       }
@@ -271,7 +272,7 @@ public class Sftp implements Protocol {
         cSftp = (ChannelSftp) session.openChannel("sftp");
         cSftp.connect();
       } catch (JSchException e) {
-        logger.error("Cannot connect to JSch session for user: " + user
+        LOG.error("Cannot connect to JSch session for user: " + user
             + ", host: " + server + ", port: " + port);
         return;
       }
@@ -281,7 +282,7 @@ public class Sftp implements Protocol {
       try {
         queue.put(cSftp);
       } catch (InterruptedException e) {
-        logger.error("Interrupted during setConf()", e);
+        LOG.error("Interrupted during setConf()", e);
         return;
       }
       channelSftpByHostMap.put(server, queue);