You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2005/06/09 13:52:00 UTC

svn commit: r189747 - in /lucene/nutch/trunk/src: java/org/apache/nutch/fetcher/ java/org/apache/nutch/parse/ java/org/apache/nutch/protocol/ plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/

Author: ab
Date: Thu Jun  9 04:52:00 2005
New Revision: 189747

URL: http://svn.apache.org/viewcvs?rev=189747&view=rev
Log:
Make ProtocolStatus and ParseStatus extend VersionedWritable - this helps to
manage different versions of segment data.

Add a lastModified field to ProtocolStatus - keeping this value accessible
there makes it possible to quickly calculate fetch times.

Correct naming of some status codes, so that they are more consistent.

NOTE: Segments created with a code base _later_ than r179436
(2005-06-02) will have to be re-created, because there is no way to keep
compatibility with their format.

Modified:
    lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
    lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutput.java
    lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java
    lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java
    lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=189747&r1=189746&r2=189747&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Thu Jun  9 04:52:00 2005
@@ -172,7 +172,7 @@
                 }
                 break;
               case ProtocolStatus.GONE:
-              case ProtocolStatus.NOT_FOUND:
+              case ProtocolStatus.NOTFOUND:
               case ProtocolStatus.ACCESS_DENIED:
               case ProtocolStatus.ROBOTS_DENIED:
               case ProtocolStatus.RETRY:

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutput.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutput.java?rev=189747&r1=189746&r2=189747&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutput.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutput.java Thu Jun  9 04:52:00 2005
@@ -63,7 +63,7 @@
   private static final byte[] oldToNewMap = {
           ProtocolStatus.RETRY,
           ProtocolStatus.SUCCESS,
-          ProtocolStatus.NOT_FOUND,
+          ProtocolStatus.NOTFOUND,
           ProtocolStatus.FAILED,
           ProtocolStatus.RETRY
   };

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java?rev=189747&r1=189746&r2=189747&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java Thu Jun  9 04:52:00 2005
@@ -10,13 +10,15 @@
 import java.io.IOException;
 import java.util.Properties;
 
-import org.apache.nutch.io.Writable;
+import org.apache.nutch.io.VersionedWritable;
 import org.apache.nutch.io.WritableUtils;
 
 /**
  * @author Andrzej Bialecki <ab@getopt.org>
  */
-public class ParseStatus implements Writable {
+public class ParseStatus extends VersionedWritable {
+  
+  private final static byte VERSION = 1;
   
   // Primary status codes:
   
@@ -68,7 +70,11 @@
   private short minorCode = 0;
   private String[] args = null;
   
-  protected ParseStatus() {
+  public byte getVersion() {
+    return VERSION;
+  }
+
+  public ParseStatus() {
     
   }
   
@@ -111,12 +117,14 @@
   }
   
   public void readFields(DataInput in) throws IOException {
+    super.readFields(in);     // check version
     majorCode = in.readByte();
     minorCode = in.readShort();
     args = WritableUtils.readCompressedStringArray(in);
   }
   
   public void write(DataOutput out) throws IOException {
+    super.write(out);         // write out version
     out.writeByte(majorCode);
     out.writeShort(minorCode);
     WritableUtils.writeCompressedStringArray(out, args);

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java?rev=189747&r1=189746&r2=189747&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java Thu Jun  9 04:52:00 2005
@@ -20,14 +20,16 @@
 import java.io.DataOutput;
 import java.io.IOException;
 
-import org.apache.nutch.io.Writable;
+import org.apache.nutch.io.VersionedWritable;
 import org.apache.nutch.io.WritableUtils;
 import org.apache.nutch.parse.ParseStatus;
 
 /**
  * @author Andrzej Bialecki <ab@getopt.org>
  */
-public class ProtocolStatus implements Writable {
+public class ProtocolStatus extends VersionedWritable {
+  
+  private final static byte VERSION = 1;
   
   /** Content was retrieved without errors. */
   public static final int SUCCESS              = 1;
@@ -43,7 +45,7 @@
   /** Resource has moved temporarily. New url should be found in args. */
   public static final int TEMP_MOVED           = 13;
   /** Resource was not found. */
-  public static final int NOT_FOUND            = 14;
+  public static final int NOTFOUND            = 14;
   /** Temporary failure. Application may retry immediately. */
   public static final int RETRY                = 15;
   /** Unspecified exception occured. Further information may be provided in args. */
@@ -53,37 +55,63 @@
   /** Access denied by robots.txt rules. */
   public static final int ROBOTS_DENIED        = 18;
   /** Too many redirects. */
-  public static final int REDIR_EXCEED         = 19;
+  public static final int REDIR_EXCEEDED         = 19;
   /** Not fetching. */
   public static final int NOTFETCHING          = 20;
   /** Unchanged since the last fetch. */
   public static final int NOTMODIFIED          = 21;
   
-  
+  // Useful static instances for status codes that don't usually require any
+  // additional arguments.
   public static final ProtocolStatus STATUS_SUCCESS = new ProtocolStatus(SUCCESS);
-  public static final ProtocolStatus STATUS_NOTFETCHING = new ProtocolStatus(NOTFETCHING);
   public static final ProtocolStatus STATUS_FAILED = new ProtocolStatus(FAILED);
+  public static final ProtocolStatus STATUS_GONE = new ProtocolStatus(GONE);
+  public static final ProtocolStatus STATUS_NOTFOUND = new ProtocolStatus(NOTFOUND);
+  public static final ProtocolStatus STATUS_RETRY = new ProtocolStatus(RETRY);
+  public static final ProtocolStatus STATUS_ROBOTS_DENIED = new ProtocolStatus(ROBOTS_DENIED);
+  public static final ProtocolStatus STATUS_REDIR_EXCEEDED = new ProtocolStatus(REDIR_EXCEEDED);
+  public static final ProtocolStatus STATUS_NOTFETCHING = new ProtocolStatus(NOTFETCHING);
   public static final ProtocolStatus STATUS_NOTMODIFIED = new ProtocolStatus(NOTMODIFIED);
   
   private int code;
+  private long lastModified;
   private String[] args;
   
-  protected ProtocolStatus() {
+  public ProtocolStatus() {
     
   }
 
+  public byte getVersion() {
+    return VERSION;
+  }
+
   public ProtocolStatus(int code, String[] args) {
     this.code = code;
     this.args = args;
   }
   
+  public ProtocolStatus(int code, String[] args, long lastModified) {
+    this.code = code;
+    this.args = args;
+    this.lastModified = lastModified;
+  }
+  
   public ProtocolStatus(int code) {
     this(code, null);
   }
   
+  public ProtocolStatus(int code, long lastModified) {
+    this(code, null, lastModified);
+  }
+  
   public ProtocolStatus(int code, Object message) {
+    this(code, message, 0L);
+  }
+  
+  public ProtocolStatus(int code, Object message, long lastModified) {
     this.code = code;
-    this.args = new String[]{String.valueOf(message)};
+    this.lastModified = lastModified;
+    if (message != null) this.args = new String[]{String.valueOf(message)};
   }
   
   public ProtocolStatus(Throwable t) {
@@ -97,15 +125,23 @@
   }
   
   public void readFields(DataInput in) throws IOException {
+    super.readFields(in);       // check version
     code = in.readByte();
+    lastModified = in.readLong();
     args = WritableUtils.readCompressedStringArray(in);
   }
   
   public void write(DataOutput out) throws IOException {
+    super.write(out);           // write version
     out.writeByte((byte)code);
+    out.writeLong(lastModified);
     WritableUtils.writeCompressedStringArray(out, args);
   }
 
+  public void setArgs(String[] args) {
+    this.args = args;
+  }
+  
   public String[] getArgs() {
     return args;
   }
@@ -114,20 +150,56 @@
     return code;
   }
   
+  public void setCode(int code) {
+    this.code = code;
+  }
+  
   public boolean isSuccess() {
     return code == SUCCESS; 
   }
   
+  public boolean isTransientFailure() {
+    return
+        code == ACCESS_DENIED ||
+        code == EXCEPTION ||
+        code == REDIR_EXCEEDED ||
+        code == RETRY ||
+        code == TEMP_MOVED ||
+        code == PROTO_NOT_FOUND; 
+  }
+  
+  public boolean isPermanentFailure() {
+    return
+        code == FAILED ||
+        code == GONE ||
+        code == MOVED ||
+        code == NOTFOUND ||
+        code == ROBOTS_DENIED;
+  }
+  
   public String getMessage() {
     if (args != null && args.length > 0) return args[0];
     return null;
   }
   
+  public void setMessage(String msg) {
+    if (args != null && args.length > 0) args[0] = msg;
+    else args = new String[] {msg};
+  }
+  
+  public long getLastModified() {
+    return lastModified;
+  }
+  
+  public void setLastModified(long lastModified) {
+    this.lastModified = lastModified;
+  }
+  
   public boolean equals(Object o) {
     if (o == null) return false;
     if (!(o instanceof ProtocolStatus)) return false;
     ProtocolStatus other = (ProtocolStatus)o;
-    if (this.code != other.code) return false;
+    if (this.code != other.code || this.lastModified != other.lastModified) return false;
     if (this.args == null) {
       if (other.args == null) return true;
       else return false;
@@ -143,7 +215,7 @@
   
   public String toString() {
     StringBuffer res = new StringBuffer();
-    res.append("(" + code + ")");
+    res.append("(" + code + "), lastModified=" + lastModified);
     if (args != null) {
       if (args.length == 1) {
         res.append(": " + String.valueOf(args[0]));

Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java?rev=189747&r1=189746&r2=189747&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java (original)
+++ lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java Thu Jun  9 04:52:00 2005
@@ -248,7 +248,7 @@
         } else if (code == 401) { // requires authorization
           LOG.fine("401 Authentication Required");
           if (redirects == MAX_REDIRECTS)
-                  return new ProtocolOutput(null, new ProtocolStatus(ProtocolStatus.REDIR_EXCEED,
+                  return new ProtocolOutput(null, new ProtocolStatus(ProtocolStatus.REDIR_EXCEEDED,
                           "Too many redirects: " + urlString));
           Properties p = response.toContent().getMetadata();
           if (p instanceof MultiProperties) {
@@ -259,7 +259,7 @@
           }
           redirects++;
         } else if (code == 404) {
-          return new ProtocolOutput(null, new ProtocolStatus(ProtocolStatus.NOT_FOUND, url));
+          return new ProtocolOutput(null, new ProtocolStatus(ProtocolStatus.NOTFOUND, url));
         } else if (code == 410) { // permanently GONE
           return new ProtocolOutput(null, new ProtocolStatus(ProtocolStatus.GONE, url));
         } else {