You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2012/07/10 18:32:08 UTC
svn commit: r1359762 - in /nutch/branches/2.x: ./ conf/
src/java/org/apache/nutch/metadata/
src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/
src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/
src/plugin/protocol-sftp/src/java/org/apache/nutch/protocol/sftp/
Author: lewismc
Date: Tue Jul 10 16:32:08 2012
New Revision: 1359762
URL: http://svn.apache.org/viewvc?rev=1359762&view=rev
Log:
revert NUTCH-1360
Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/conf/nutch-default.xml
nutch/branches/2.x/src/java/org/apache/nutch/metadata/HttpHeaders.java
nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
nutch/branches/2.x/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
nutch/branches/2.x/src/plugin/protocol-sftp/src/java/org/apache/nutch/protocol/sftp/Sftp.java
Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1359762&r1=1359761&r2=1359762&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Tue Jul 10 16:32:08 2012
@@ -45,8 +45,6 @@ Full Jira report - https://issues.apache
* NUTCH-1361 Fix mishandling of malformed urls in generator job (Jason Trost via lewismc)
-* NUTCH-1360 Support the storing of IP address connected to when web crawling (lewismc)
-
* NUTCH-1366 speed up indexing by eliminating the indexreducer (ferdy)
* NUTCH-1362 Fix error handling of urls with empty fields (lewis, ferdy)
Modified: nutch/branches/2.x/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/nutch-default.xml?rev=1359762&r1=1359761&r2=1359762&view=diff
==============================================================================
--- nutch/branches/2.x/conf/nutch-default.xml (original)
+++ nutch/branches/2.x/conf/nutch-default.xml Tue Jul 10 16:32:08 2012
@@ -257,13 +257,6 @@
</description>
</property>
-<property>
- <name>http.store.ip.address</name>
- <value>false</value>
- <description>Enables us to capture the specific IP address of the
- host which we connect to to fetch a page.</description>
-</property>
-
<!-- FTP properties -->
<property>
Modified: nutch/branches/2.x/src/java/org/apache/nutch/metadata/HttpHeaders.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/metadata/HttpHeaders.java?rev=1359762&r1=1359761&r2=1359762&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/metadata/HttpHeaders.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/metadata/HttpHeaders.java Tue Jul 10 16:32:08 2012
@@ -46,7 +46,5 @@ public interface HttpHeaders {
public final static String LAST_MODIFIED = "Last-Modified";
public final static String LOCATION = "Location";
-
- public final static String IP_ADDRESS = "_ip";
}
Modified: nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=1359762&r1=1359761&r2=1359762&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java (original)
+++ nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Tue Jul 10 16:32:08 2012
@@ -68,8 +68,8 @@ public abstract class HttpBase implement
/** The Nutch 'User-Agent' request header */
protected String userAgent = getAgentString(
"NutchCVS", null, "Nutch",
- "http://nutch.apache.org/bot.html",
- "agent@nutch.apache.org");
+ "http://lucene.apache.org/nutch/bot.html",
+ "nutch-agent@lucene.apache.org");
/** The "Accept-Language" request header value. */
@@ -77,9 +77,6 @@ public abstract class HttpBase implement
/** The "Accept" request header value. */
protected String accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
-
- /** The "_ip" request header value. */
- protected boolean ip_header = false;
/** The default logger */
private final static Logger LOGGER = LoggerFactory.getLogger(HttpBase.class);
@@ -120,7 +117,6 @@ public abstract class HttpBase implement
.get("http.agent.description"), conf.get("http.agent.url"), conf.get("http.agent.email"));
this.acceptLanguage = conf.get("http.accept.language", acceptLanguage);
this.accept = conf.get("http.accept", accept);
- this.ip_header = conf.getBoolean("http.store.ip.address", false);
this.mimeTypes = new MimeUtil(conf);
this.useHttp11 = conf.getBoolean("http.useHttp11", false);
this.robots.setConf(conf);
@@ -250,10 +246,6 @@ public abstract class HttpBase implement
public boolean getUseHttp11() {
return useHttp11;
}
-
- public boolean getIP_Header(){
- return ip_header;
- }
private static String getAgentString(String agentName,
String agentVersion,
@@ -309,7 +301,6 @@ public abstract class HttpBase implement
logger.info("http.agent = " + userAgent);
logger.info("http.accept.language = " + acceptLanguage);
logger.info("http.accept = " + accept);
- logger.info("http.store.ip.address = " + ip_header);
}
}
Modified: nutch/branches/2.x/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java?rev=1359762&r1=1359761&r2=1359762&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java (original)
+++ nutch/branches/2.x/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java Tue Jul 10 16:32:08 2012
@@ -89,9 +89,7 @@ public class HttpResponse implements Res
int sockPort = http.useProxy() ? http.getProxyPort() : port;
InetSocketAddress sockAddr= new InetSocketAddress(sockHost, sockPort);
socket.connect(sockAddr, http.getTimeout());
-
- headers.set("_ip", socket.getInetAddress().getHostAddress());
-
+
// make request
OutputStream req = socket.getOutputStream();
@@ -101,20 +99,14 @@ public class HttpResponse implements Res
} else {
reqStr.append(path);
}
-
+
reqStr.append(" HTTP/1.0\r\n");
reqStr.append("Host: ");
reqStr.append(host);
reqStr.append(portString);
reqStr.append("\r\n");
-
- if(this.http.getConf().getBoolean("http.store.ip.address", true)) {
- reqStr.append("_ip: ");
- reqStr.append(http.getIP_Header());
- reqStr.append("\r\n");
- }
-
+
reqStr.append("Accept-Encoding: x-gzip, gzip\r\n");
reqStr.append("Accept: ");
Modified: nutch/branches/2.x/src/plugin/protocol-sftp/src/java/org/apache/nutch/protocol/sftp/Sftp.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-sftp/src/java/org/apache/nutch/protocol/sftp/Sftp.java?rev=1359762&r1=1359761&r2=1359762&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/protocol-sftp/src/java/org/apache/nutch/protocol/sftp/Sftp.java (original)
+++ nutch/branches/2.x/src/plugin/protocol-sftp/src/java/org/apache/nutch/protocol/sftp/Sftp.java Tue Jul 10 16:32:08 2012
@@ -32,6 +32,7 @@ import java.util.concurrent.BlockingQueu
//APACHE imports
import org.apache.hadoop.conf.Configuration;
+import org.apache.log4j.Logger;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.net.protocols.Response;
import org.apache.nutch.protocol.Content;
@@ -41,10 +42,6 @@ import org.apache.nutch.protocol.RobotRu
import org.apache.nutch.storage.WebPage;
import org.apache.nutch.storage.WebPage.Field;
-//Logging imports
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
//JSCH imports
import com.jcraft.jsch.ChannelSftp;
import com.jcraft.jsch.JSch;
@@ -59,7 +56,7 @@ import com.jcraft.jsch.ChannelSftp.LsEnt
*/
public class Sftp implements Protocol {
- private static final Logger LOG = LoggerFactory.getLogger(Sftp.class);
+ private static final Logger logger = Logger.getLogger(Sftp.class);
private static final Map<String, BlockingQueue<ChannelSftp>> channelSftpByHostMap = new Hashtable<String, BlockingQueue<ChannelSftp>>();
private Configuration configuration;
@@ -95,7 +92,7 @@ public class Sftp implements Protocol {
return po;
}
} catch (MalformedURLException e) {
- LOG.error("Bad URL String: " + urlStr, e);
+ logger.error("Bad URL String: " + urlStr, e);
return null;
} catch (InterruptedException e) {
return null;
@@ -104,14 +101,14 @@ public class Sftp implements Protocol {
} catch (IOException e) {
return null;
} catch (Exception e) {
- LOG.error("Unknown Exception in getProtocolOutput()", e);
+ logger.error("Unknown Exception in getProtocolOutput()", e);
return null;
} finally {
if (channelSftp != null) {
try {
putChannelSftp(sUrl, channelSftp);
} catch (InterruptedException e) {
- LOG.error("Cannot return ChannelSftp object to Queue", e);
+ logger.error("Cannot return ChannelSftp object to Queue", e);
}
}
}
@@ -128,7 +125,8 @@ public class Sftp implements Protocol {
ChannelSftp cSftp = queue.take();
return cSftp;
} catch (InterruptedException e) {
- LOG.error("Wait for getChannelSftp() interrupted for host: " + host, e);
+ logger
+ .error("Wait for getChannelSftp() interrupted for host: " + host, e);
throw e;
}
}
@@ -144,7 +142,8 @@ public class Sftp implements Protocol {
try {
queue.put(cSftp);
} catch (InterruptedException e) {
- LOG.error("Wait for putChannelSftp() interrupted for host: " + host, e);
+ logger
+ .error("Wait for putChannelSftp() interrupted for host: " + host, e);
throw e;
}
}
@@ -160,11 +159,11 @@ public class Sftp implements Protocol {
bytes = new byte[size];
iStream.read(bytes);
} catch (SftpException e) {
- LOG.error("SftpException in getFileProtocolOutput(), file: "
+ logger.error("SftpException in getFileProtocolOutput(), file: "
+ url.getFile(), e);
throw e;
} catch (IOException e) {
- LOG.error("IOException in getFileProtocolOutput(), file: "
+ logger.error("IOException in getFileProtocolOutput(), file: "
+ url.getFile(), e);
throw e;
} finally {
@@ -220,7 +219,7 @@ public class Sftp implements Protocol {
ProtocolOutput po = new ProtocolOutput(content);
return po;
} catch (SftpException e) {
- LOG.error("SftpException in getDirectoryProtocolOutput()", e);
+ logger.error("SftpException in getDirectoryProtocolOutput()", e);
throw e;
}
}
@@ -255,7 +254,7 @@ public class Sftp implements Protocol {
try {
session = jsch.getSession(user, server, port);
} catch (JSchException e) {
- LOG.error("Cannot create JSch session for user: " + user
+ logger.error("Cannot create JSch session for user: " + user
+ ", host: " + server + ", port: " + port);
return;
}
@@ -272,7 +271,7 @@ public class Sftp implements Protocol {
cSftp = (ChannelSftp) session.openChannel("sftp");
cSftp.connect();
} catch (JSchException e) {
- LOG.error("Cannot connect to JSch session for user: " + user
+ logger.error("Cannot connect to JSch session for user: " + user
+ ", host: " + server + ", port: " + port);
return;
}
@@ -282,7 +281,7 @@ public class Sftp implements Protocol {
try {
queue.put(cSftp);
} catch (InterruptedException e) {
- LOG.error("Interrupted during setConf()", e);
+ logger.error("Interrupted during setConf()", e);
return;
}
channelSftpByHostMap.put(server, queue);