You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2012/07/10 18:32:08 UTC

svn commit: r1359762 - in /nutch/branches/2.x: ./ conf/ src/java/org/apache/nutch/metadata/ src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/ src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/ src/plugin/protocol-sftp/src/...

Author: lewismc
Date: Tue Jul 10 16:32:08 2012
New Revision: 1359762

URL: http://svn.apache.org/viewvc?rev=1359762&view=rev
Log:
revert NUTCH-1360

Modified:
    nutch/branches/2.x/CHANGES.txt
    nutch/branches/2.x/conf/nutch-default.xml
    nutch/branches/2.x/src/java/org/apache/nutch/metadata/HttpHeaders.java
    nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
    nutch/branches/2.x/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
    nutch/branches/2.x/src/plugin/protocol-sftp/src/java/org/apache/nutch/protocol/sftp/Sftp.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1359762&r1=1359761&r2=1359762&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Tue Jul 10 16:32:08 2012
@@ -45,8 +45,6 @@ Full Jira report - https://issues.apache
 
 * NUTCH-1361 Fix mishandling of malformed urls in generator job (Jason Trost via lewismc)
 
-* NUTCH-1360 Support the storing of IP address connected to when web crawling (lewismc)
-
 * NUTCH-1366 speed up indexing by eliminating the indexreducer (ferdy)
 
 * NUTCH-1362 Fix error handling of urls with empty fields (lewis, ferdy)

Modified: nutch/branches/2.x/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/nutch-default.xml?rev=1359762&r1=1359761&r2=1359762&view=diff
==============================================================================
--- nutch/branches/2.x/conf/nutch-default.xml (original)
+++ nutch/branches/2.x/conf/nutch-default.xml Tue Jul 10 16:32:08 2012
@@ -257,13 +257,6 @@
   </description>
 </property>
 
-<property>
-  <name>http.store.ip.address</name>
-  <value>false</value>
-  <description>Enables us to capture the specific IP address of the 
-  host which we connect to to fetch a page.</description>
-</property>
-
 <!-- FTP properties -->
 
 <property>

Modified: nutch/branches/2.x/src/java/org/apache/nutch/metadata/HttpHeaders.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/metadata/HttpHeaders.java?rev=1359762&r1=1359761&r2=1359762&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/metadata/HttpHeaders.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/metadata/HttpHeaders.java Tue Jul 10 16:32:08 2012
@@ -46,7 +46,5 @@ public interface HttpHeaders {
   public final static String LAST_MODIFIED = "Last-Modified";
 
   public final static String LOCATION = "Location";
-  
-  public final static String IP_ADDRESS = "_ip";
 
 }

Modified: nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=1359762&r1=1359761&r2=1359762&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java (original)
+++ nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Tue Jul 10 16:32:08 2012
@@ -68,8 +68,8 @@ public abstract class HttpBase implement
   /** The Nutch 'User-Agent' request header */
   protected String userAgent = getAgentString(
       "NutchCVS", null, "Nutch",
-      "http://nutch.apache.org/bot.html",
-  "agent@nutch.apache.org");
+      "http://lucene.apache.org/nutch/bot.html",
+  "nutch-agent@lucene.apache.org");
 
 
   /** The "Accept-Language" request header value. */
@@ -77,9 +77,6 @@ public abstract class HttpBase implement
   
   /** The "Accept" request header value. */
   protected String accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
-  
-  /** The "_ip" request header value. */
-  protected boolean ip_header = false;
 
   /** The default logger */
   private final static Logger LOGGER = LoggerFactory.getLogger(HttpBase.class);
@@ -120,7 +117,6 @@ public abstract class HttpBase implement
         .get("http.agent.description"), conf.get("http.agent.url"), conf.get("http.agent.email"));
     this.acceptLanguage = conf.get("http.accept.language", acceptLanguage);
     this.accept = conf.get("http.accept", accept);
-    this.ip_header = conf.getBoolean("http.store.ip.address", false);
     this.mimeTypes = new MimeUtil(conf);
     this.useHttp11 = conf.getBoolean("http.useHttp11", false);
     this.robots.setConf(conf);
@@ -250,10 +246,6 @@ public abstract class HttpBase implement
   public boolean getUseHttp11() {
     return useHttp11;
   }
-  
-  public boolean getIP_Header(){
-	return ip_header;
-  }
 
   private static String getAgentString(String agentName,
       String agentVersion,
@@ -309,7 +301,6 @@ public abstract class HttpBase implement
       logger.info("http.agent = " + userAgent);
       logger.info("http.accept.language = " + acceptLanguage);
       logger.info("http.accept = " + accept);
-      logger.info("http.store.ip.address = " + ip_header);
     }
   }
 

Modified: nutch/branches/2.x/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java?rev=1359762&r1=1359761&r2=1359762&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java (original)
+++ nutch/branches/2.x/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java Tue Jul 10 16:32:08 2012
@@ -89,9 +89,7 @@ public class HttpResponse implements Res
       int sockPort = http.useProxy() ? http.getProxyPort() : port;
       InetSocketAddress sockAddr= new InetSocketAddress(sockHost, sockPort);
       socket.connect(sockAddr, http.getTimeout());
-      
-      headers.set("_ip", socket.getInetAddress().getHostAddress());
-      
+
       // make request
       OutputStream req = socket.getOutputStream();
 
@@ -101,20 +99,14 @@ public class HttpResponse implements Res
       } else {
       	reqStr.append(path);
       }
-      
+
       reqStr.append(" HTTP/1.0\r\n");
 
       reqStr.append("Host: ");
       reqStr.append(host);
       reqStr.append(portString);
       reqStr.append("\r\n");
-      
-      if(this.http.getConf().getBoolean("http.store.ip.address", true)) {
-        reqStr.append("_ip: ");
-    	reqStr.append(http.getIP_Header());
-    	reqStr.append("\r\n");
-      }
-      
+
       reqStr.append("Accept-Encoding: x-gzip, gzip\r\n");
 
       reqStr.append("Accept: ");

Modified: nutch/branches/2.x/src/plugin/protocol-sftp/src/java/org/apache/nutch/protocol/sftp/Sftp.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-sftp/src/java/org/apache/nutch/protocol/sftp/Sftp.java?rev=1359762&r1=1359761&r2=1359762&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/protocol-sftp/src/java/org/apache/nutch/protocol/sftp/Sftp.java (original)
+++ nutch/branches/2.x/src/plugin/protocol-sftp/src/java/org/apache/nutch/protocol/sftp/Sftp.java Tue Jul 10 16:32:08 2012
@@ -32,6 +32,7 @@ import java.util.concurrent.BlockingQueu
 
 //APACHE imports
 import org.apache.hadoop.conf.Configuration;
+import org.apache.log4j.Logger;
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.net.protocols.Response;
 import org.apache.nutch.protocol.Content;
@@ -41,10 +42,6 @@ import org.apache.nutch.protocol.RobotRu
 import org.apache.nutch.storage.WebPage;
 import org.apache.nutch.storage.WebPage.Field;
 
-//Logging imports
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
 //JSCH imports
 import com.jcraft.jsch.ChannelSftp;
 import com.jcraft.jsch.JSch;
@@ -59,7 +56,7 @@ import com.jcraft.jsch.ChannelSftp.LsEnt
  */
 public class Sftp implements Protocol {
 
-  private static final Logger LOG = LoggerFactory.getLogger(Sftp.class);
+  private static final Logger logger = Logger.getLogger(Sftp.class);
   private static final Map<String, BlockingQueue<ChannelSftp>> channelSftpByHostMap = new Hashtable<String, BlockingQueue<ChannelSftp>>();
 
   private Configuration configuration;
@@ -95,7 +92,7 @@ public class Sftp implements Protocol {
         return po;
       }
     } catch (MalformedURLException e) {
-      LOG.error("Bad URL String: " + urlStr, e);
+      logger.error("Bad URL String: " + urlStr, e);
       return null;
     } catch (InterruptedException e) {
       return null;
@@ -104,14 +101,14 @@ public class Sftp implements Protocol {
     } catch (IOException e) {
       return null;
     } catch (Exception e) {
-      LOG.error("Unknown Exception in getProtocolOutput()", e);
+      logger.error("Unknown Exception in getProtocolOutput()", e);
       return null;
     } finally {
       if (channelSftp != null) {
         try {
           putChannelSftp(sUrl, channelSftp);
         } catch (InterruptedException e) {
-          LOG.error("Cannot return ChannelSftp object to Queue", e);
+          logger.error("Cannot return ChannelSftp object to Queue", e);
         }
       }
     }
@@ -128,7 +125,8 @@ public class Sftp implements Protocol {
       ChannelSftp cSftp = queue.take();
       return cSftp;
     } catch (InterruptedException e) {
-      LOG.error("Wait for getChannelSftp() interrupted for host: " + host, e);
+      logger
+          .error("Wait for getChannelSftp() interrupted for host: " + host, e);
       throw e;
     }
   }
@@ -144,7 +142,8 @@ public class Sftp implements Protocol {
     try {
       queue.put(cSftp);
     } catch (InterruptedException e) {
-      LOG.error("Wait for putChannelSftp() interrupted for host: " + host, e);
+      logger
+          .error("Wait for putChannelSftp() interrupted for host: " + host, e);
       throw e;
     }
   }
@@ -160,11 +159,11 @@ public class Sftp implements Protocol {
       bytes = new byte[size];
       iStream.read(bytes);
     } catch (SftpException e) {
-      LOG.error("SftpException in getFileProtocolOutput(), file: "
+      logger.error("SftpException in getFileProtocolOutput(), file: "
           + url.getFile(), e);
       throw e;
     } catch (IOException e) {
-      LOG.error("IOException in getFileProtocolOutput(), file: "
+      logger.error("IOException in getFileProtocolOutput(), file: "
           + url.getFile(), e);
       throw e;
     } finally {
@@ -220,7 +219,7 @@ public class Sftp implements Protocol {
       ProtocolOutput po = new ProtocolOutput(content);
       return po;
     } catch (SftpException e) {
-      LOG.error("SftpException in getDirectoryProtocolOutput()", e);
+      logger.error("SftpException in getDirectoryProtocolOutput()", e);
       throw e;
     }
   }
@@ -255,7 +254,7 @@ public class Sftp implements Protocol {
       try {
         session = jsch.getSession(user, server, port);
       } catch (JSchException e) {
-        LOG.error("Cannot create JSch session for user: " + user
+        logger.error("Cannot create JSch session for user: " + user
             + ", host: " + server + ", port: " + port);
         return;
       }
@@ -272,7 +271,7 @@ public class Sftp implements Protocol {
         cSftp = (ChannelSftp) session.openChannel("sftp");
         cSftp.connect();
       } catch (JSchException e) {
-        LOG.error("Cannot connect to JSch session for user: " + user
+        logger.error("Cannot connect to JSch session for user: " + user
             + ", host: " + server + ", port: " + port);
         return;
       }
@@ -282,7 +281,7 @@ public class Sftp implements Protocol {
       try {
         queue.put(cSftp);
       } catch (InterruptedException e) {
-        LOG.error("Interrupted during setConf()", e);
+        logger.error("Interrupted during setConf()", e);
         return;
       }
       channelSftpByHostMap.put(server, queue);