You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2005/06/09 13:52:00 UTC
svn commit: r189747 - in /lucene/nutch/trunk/src:
java/org/apache/nutch/fetcher/ java/org/apache/nutch/parse/
java/org/apache/nutch/protocol/
plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/
Author: ab
Date: Thu Jun 9 04:52:00 2005
New Revision: 189747
URL: http://svn.apache.org/viewcvs?rev=189747&view=rev
Log:
Make ProtocolStatus and ParseStatus extend VersionedWritable - this helps to
manage different versions of segment data.
Add lastModified field to ProtocolStatus - keeping it readily accessible
makes it possible to calculate fetch times quickly.
Correct naming of some status codes, so that they are more consistent.
NOTE: Segments created with the code base _later_ than r179436
(2005-06-02) will have to be re-created, because there is no way to keep
compatibility with their format.
Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutput.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java
lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=189747&r1=189746&r2=189747&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Thu Jun 9 04:52:00 2005
@@ -172,7 +172,7 @@
}
break;
case ProtocolStatus.GONE:
- case ProtocolStatus.NOT_FOUND:
+ case ProtocolStatus.NOTFOUND:
case ProtocolStatus.ACCESS_DENIED:
case ProtocolStatus.ROBOTS_DENIED:
case ProtocolStatus.RETRY:
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutput.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutput.java?rev=189747&r1=189746&r2=189747&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutput.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutput.java Thu Jun 9 04:52:00 2005
@@ -63,7 +63,7 @@
private static final byte[] oldToNewMap = {
ProtocolStatus.RETRY,
ProtocolStatus.SUCCESS,
- ProtocolStatus.NOT_FOUND,
+ ProtocolStatus.NOTFOUND,
ProtocolStatus.FAILED,
ProtocolStatus.RETRY
};
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java?rev=189747&r1=189746&r2=189747&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java Thu Jun 9 04:52:00 2005
@@ -10,13 +10,15 @@
import java.io.IOException;
import java.util.Properties;
-import org.apache.nutch.io.Writable;
+import org.apache.nutch.io.VersionedWritable;
import org.apache.nutch.io.WritableUtils;
/**
* @author Andrzej Bialecki <ab@getopt.org>
*/
-public class ParseStatus implements Writable {
+public class ParseStatus extends VersionedWritable {
+
+ private final static byte VERSION = 1;
// Primary status codes:
@@ -68,7 +70,11 @@
private short minorCode = 0;
private String[] args = null;
- protected ParseStatus() {
+ public byte getVersion() {
+ return VERSION;
+ }
+
+ public ParseStatus() {
}
@@ -111,12 +117,14 @@
}
public void readFields(DataInput in) throws IOException {
+ super.readFields(in); // check version
majorCode = in.readByte();
minorCode = in.readShort();
args = WritableUtils.readCompressedStringArray(in);
}
public void write(DataOutput out) throws IOException {
+ super.write(out); // write out version
out.writeByte(majorCode);
out.writeShort(minorCode);
WritableUtils.writeCompressedStringArray(out, args);
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java?rev=189747&r1=189746&r2=189747&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java Thu Jun 9 04:52:00 2005
@@ -20,14 +20,16 @@
import java.io.DataOutput;
import java.io.IOException;
-import org.apache.nutch.io.Writable;
+import org.apache.nutch.io.VersionedWritable;
import org.apache.nutch.io.WritableUtils;
import org.apache.nutch.parse.ParseStatus;
/**
* @author Andrzej Bialecki <ab@getopt.org>
*/
-public class ProtocolStatus implements Writable {
+public class ProtocolStatus extends VersionedWritable {
+
+ private final static byte VERSION = 1;
/** Content was retrieved without errors. */
public static final int SUCCESS = 1;
@@ -43,7 +45,7 @@
/** Resource has moved temporarily. New url should be found in args. */
public static final int TEMP_MOVED = 13;
/** Resource was not found. */
- public static final int NOT_FOUND = 14;
+ public static final int NOTFOUND = 14;
/** Temporary failure. Application may retry immediately. */
public static final int RETRY = 15;
/** Unspecified exception occurred. Further information may be provided in args. */
@@ -53,37 +55,63 @@
/** Access denied by robots.txt rules. */
public static final int ROBOTS_DENIED = 18;
/** Too many redirects. */
- public static final int REDIR_EXCEED = 19;
+ public static final int REDIR_EXCEEDED = 19;
/** Not fetching. */
public static final int NOTFETCHING = 20;
/** Unchanged since the last fetch. */
public static final int NOTMODIFIED = 21;
-
+ // Useful static instances for status codes that don't usually require any
+ // additional arguments.
public static final ProtocolStatus STATUS_SUCCESS = new ProtocolStatus(SUCCESS);
- public static final ProtocolStatus STATUS_NOTFETCHING = new ProtocolStatus(NOTFETCHING);
public static final ProtocolStatus STATUS_FAILED = new ProtocolStatus(FAILED);
+ public static final ProtocolStatus STATUS_GONE = new ProtocolStatus(GONE);
+ public static final ProtocolStatus STATUS_NOTFOUND = new ProtocolStatus(NOTFOUND);
+ public static final ProtocolStatus STATUS_RETRY = new ProtocolStatus(RETRY);
+ public static final ProtocolStatus STATUS_ROBOTS_DENIED = new ProtocolStatus(ROBOTS_DENIED);
+ public static final ProtocolStatus STATUS_REDIR_EXCEEDED = new ProtocolStatus(REDIR_EXCEEDED);
+ public static final ProtocolStatus STATUS_NOTFETCHING = new ProtocolStatus(NOTFETCHING);
public static final ProtocolStatus STATUS_NOTMODIFIED = new ProtocolStatus(NOTMODIFIED);
private int code;
+ private long lastModified;
private String[] args;
- protected ProtocolStatus() {
+ public ProtocolStatus() {
}
+ public byte getVersion() {
+ return VERSION;
+ }
+
public ProtocolStatus(int code, String[] args) {
this.code = code;
this.args = args;
}
+ public ProtocolStatus(int code, String[] args, long lastModified) {
+ this.code = code;
+ this.args = args;
+ this.lastModified = lastModified;
+ }
+
public ProtocolStatus(int code) {
this(code, null);
}
+ public ProtocolStatus(int code, long lastModified) {
+ this(code, null, lastModified);
+ }
+
public ProtocolStatus(int code, Object message) {
+ this(code, message, 0L);
+ }
+
+ public ProtocolStatus(int code, Object message, long lastModified) {
this.code = code;
- this.args = new String[]{String.valueOf(message)};
+ this.lastModified = lastModified;
+ if (message != null) this.args = new String[]{String.valueOf(message)};
}
public ProtocolStatus(Throwable t) {
@@ -97,15 +125,23 @@
}
public void readFields(DataInput in) throws IOException {
+ super.readFields(in); // check version
code = in.readByte();
+ lastModified = in.readLong();
args = WritableUtils.readCompressedStringArray(in);
}
public void write(DataOutput out) throws IOException {
+ super.write(out); // write version
out.writeByte((byte)code);
+ out.writeLong(lastModified);
WritableUtils.writeCompressedStringArray(out, args);
}
+ public void setArgs(String[] args) {
+ this.args = args;
+ }
+
public String[] getArgs() {
return args;
}
@@ -114,20 +150,56 @@
return code;
}
+ public void setCode(int code) {
+ this.code = code;
+ }
+
public boolean isSuccess() {
return code == SUCCESS;
}
+ public boolean isTransientFailure() {
+ return
+ code == ACCESS_DENIED ||
+ code == EXCEPTION ||
+ code == REDIR_EXCEEDED ||
+ code == RETRY ||
+ code == TEMP_MOVED ||
+ code == PROTO_NOT_FOUND;
+ }
+
+ public boolean isPermanentFailure() {
+ return
+ code == FAILED ||
+ code == GONE ||
+ code == MOVED ||
+ code == NOTFOUND ||
+ code == ROBOTS_DENIED;
+ }
+
public String getMessage() {
if (args != null && args.length > 0) return args[0];
return null;
}
+ public void setMessage(String msg) {
+ if (args != null && args.length > 0) args[0] = msg;
+ else args = new String[] {msg};
+ }
+
+ public long getLastModified() {
+ return lastModified;
+ }
+
+ public void setLastModified(long lastModified) {
+ this.lastModified = lastModified;
+ }
+
public boolean equals(Object o) {
if (o == null) return false;
if (!(o instanceof ProtocolStatus)) return false;
ProtocolStatus other = (ProtocolStatus)o;
- if (this.code != other.code) return false;
+ if (this.code != other.code || this.lastModified != other.lastModified) return false;
if (this.args == null) {
if (other.args == null) return true;
else return false;
@@ -143,7 +215,7 @@
public String toString() {
StringBuffer res = new StringBuffer();
- res.append("(" + code + ")");
+ res.append("(" + code + "), lastModified=" + lastModified);
if (args != null) {
if (args.length == 1) {
res.append(": " + String.valueOf(args[0]));
Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java?rev=189747&r1=189746&r2=189747&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java (original)
+++ lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java Thu Jun 9 04:52:00 2005
@@ -248,7 +248,7 @@
} else if (code == 401) { // requires authorization
LOG.fine("401 Authentication Required");
if (redirects == MAX_REDIRECTS)
- return new ProtocolOutput(null, new ProtocolStatus(ProtocolStatus.REDIR_EXCEED,
+ return new ProtocolOutput(null, new ProtocolStatus(ProtocolStatus.REDIR_EXCEEDED,
"Too many redirects: " + urlString));
Properties p = response.toContent().getMetadata();
if (p instanceof MultiProperties) {
@@ -259,7 +259,7 @@
}
redirects++;
} else if (code == 404) {
- return new ProtocolOutput(null, new ProtocolStatus(ProtocolStatus.NOT_FOUND, url));
+ return new ProtocolOutput(null, new ProtocolStatus(ProtocolStatus.NOTFOUND, url));
} else if (code == 410) { // permanently GONE
return new ProtocolOutput(null, new ProtocolStatus(ProtocolStatus.GONE, url));
} else {