You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2006/11/28 22:02:12 UTC

svn commit: r480207 - in /lucene/nutch/trunk/src: java/org/apache/nutch/metadata/ java/org/apache/nutch/protocol/ plugin/protocol-http/src/java/org/apache/nutch/protocol/http/ plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/

Author: ab
Date: Tue Nov 28 13:02:10 2006
New Revision: 480207

URL: http://svn.apache.org/viewvc?view=rev&rev=480207
Log:
Use SpellCheckedMetadata only when necessary, i.e. only when collecting
metadata from unreliable sources such as HTTP headers.

* Metadata: fix a bug where SpellCheckedMetadata would try to normalize
  metadata names during (de)serialization.

* Content: use regular Metadata by default and when de-serializing.

* HTTP protocol plugins: use SpellCheckedMetadata for response headers,
  where it's really necessary.


Modified:
    lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java
    lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java
    lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
    lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java?view=diff&rev=480207&r1=480206&r2=480207
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java Tue Nov 28 13:02:10 2006
@@ -92,6 +92,10 @@
    * @return the values associated to a metadata name.
    */
   public String[] getValues(final String name) {
+    return _getValues(name);
+  }
+  
+  private String[] _getValues(final String name) {
     String[] values = metadata.get(name);
     if (values == null) {
       values = new String[0];
@@ -174,8 +178,8 @@
 
     String[] names = names();
     for (int i = 0; i < names.length; i++) {
-      String[] otherValues = other.getValues(names[i]);
-      String[] thisValues = getValues(names[i]);
+      String[] otherValues = other._getValues(names[i]);
+      String[] thisValues = _getValues(names[i]);
       if (otherValues.length != thisValues.length) {
         return false;
       }
@@ -192,7 +196,7 @@
     StringBuffer buf = new StringBuffer();
     String[] names = names();
     for (int i = 0; i < names.length; i++) {
-      String[] values = getValues(names[i]);
+      String[] values = _getValues(names[i]);
       for (int j = 0; j < values.length; j++) {
         buf.append(names[i])
            .append("=")
@@ -209,7 +213,7 @@
     String[] names = names();
     for (int i = 0; i < names.length; i++) {
       Text.writeString(out, names[i]);
-      values = getValues(names[i]);
+      values = _getValues(names[i]);
       int cnt = 0;
       for (int j = 0; j < values.length; j++) {
         if (values[j] != null)

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java?view=diff&rev=480207&r1=480206&r2=480207
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java Tue Nov 28 13:02:10 2006
@@ -31,7 +31,6 @@
 import org.apache.hadoop.io.UTF8;
 import org.apache.hadoop.io.VersionMismatchException;
 import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.metadata.SpellCheckedMetadata;
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.mime.MimeType;
 import org.apache.nutch.util.mime.MimeTypeException;
@@ -97,7 +96,7 @@
 
   protected final void readFieldsCompressed(DataInput in) throws IOException {
     version = in.readByte();
-    metadata = new SpellCheckedMetadata();
+    metadata = new Metadata();
     switch (version) {
     case 0:
     case 1:

Modified: lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java?view=diff&rev=480207&r1=480206&r2=480207
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java (original)
+++ lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java Tue Nov 28 13:02:10 2006
@@ -31,6 +31,7 @@
 // Nutch imports
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.SpellCheckedMetadata;
 import org.apache.nutch.net.protocols.Response;
 import org.apache.nutch.protocol.ProtocolException;
 import org.apache.nutch.protocol.http.api.HttpBase;
@@ -47,7 +48,7 @@
   private String base;
   private byte[] content;
   private int code;
-  private Metadata headers = new Metadata();
+  private Metadata headers = new SpellCheckedMetadata();
 
 
   public HttpResponse(HttpBase http, URL url, CrawlDatum datum)

Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java?view=diff&rev=480207&r1=480206&r2=480207
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java (original)
+++ lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java Tue Nov 28 13:02:10 2006
@@ -37,6 +37,7 @@
 // Nutch imports
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.SpellCheckedMetadata;
 import org.apache.nutch.net.protocols.Response;
 import org.apache.nutch.protocol.http.api.HttpBase;
 import org.apache.nutch.util.LogUtil;
@@ -61,7 +62,7 @@
 
   private int code;
 
-  private Metadata headers = new Metadata();
+  private Metadata headers = new SpellCheckedMetadata();
 
   
   public HttpResponse(HttpBase http, URL url, CrawlDatum datum) throws IOException {