You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by si...@apache.org on 2006/11/13 20:46:57 UTC

svn commit: r474464 - in /lucene/nutch/trunk: ./ src/java/org/apache/nutch/metadata/ src/java/org/apache/nutch/protocol/ src/test/org/apache/nutch/metadata/ src/test/org/apache/nutch/protocol/

Author: siren
Date: Mon Nov 13 11:46:56 2006
New Revision: 474464

URL: http://svn.apache.org/viewvc?view=rev&rev=474464
Log:
NUTCH-395 Increase fetching speed

Added:
    lucene/nutch/trunk/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java
    lucene/nutch/trunk/src/test/org/apache/nutch/metadata/TestSpellCheckedMetadata.java
Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java
    lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java
    lucene/nutch/trunk/src/test/org/apache/nutch/metadata/TestMetadata.java
    lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=474464&r1=474463&r2=474464
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Mon Nov 13 11:46:56 2006
@@ -71,6 +71,7 @@
 
 22. NUTCH-399 - Change CommandRunner to use concurrent api from jdk (siren)
 
+23. NUTCH-395 - Increase fetching speed (siren)
 
 Release 0.8 - 2006-07-25
 

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java?view=diff&rev=474464&r1=474463&r2=474464
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java Mon Nov 13 11:46:56 2006
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -16,103 +16,58 @@
  */
 package org.apache.nutch.metadata;
 
-// JDK imports
 import java.io.DataInput;
 import java.io.DataOutput;
 import java.io.IOException;
-import java.lang.reflect.Field;
-import java.lang.reflect.Modifier;
-import java.text.SimpleDateFormat;
-import java.util.ArrayList;
 import java.util.Enumeration;
 import java.util.HashMap;
-import java.util.Iterator;
-import java.util.List;
 import java.util.Map;
 import java.util.Properties;
-
-// Commons Lang imports
-import org.apache.commons.lang.StringUtils;
-
-// Hadoop imports
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.Writable;
 
 
 /**
- * A syntax tolerant and multi-valued metadata container.
- *
- * All the static String fields declared by this class are used as reference
- * names for syntax correction on meta-data naming.
+ * A multi-valued metadata container.
  *
  * @author Chris Mattmann
  * @author Jérôme Charron
+ *
  */
-public class Metadata implements CreativeCommons,
-                                 DublinCore,
-                                 HttpHeaders,
-                                 Nutch,
-                                 Office,
-                                 Writable {
-  
-
-  /** Used to format DC dates for the DATE metadata field */
-  public final static SimpleDateFormat DATE_FORMAT = 
-          new SimpleDateFormat("yyyy-MM-dd");
-  
-    
-  private final static Map NAMES_IDX = new HashMap();
-  private static String[] normalized = null;
-
-  // Uses self introspection to fill the metanames index and the
-  // metanames list.
-  static {
-    Field[] fields = Metadata.class.getFields();
-    for (int i=0; i<fields.length; i++) {
-      int mods = fields[i].getModifiers();
-      if (Modifier.isFinal(mods) &&
-          Modifier.isPublic(mods) &&
-          Modifier.isStatic(mods) &&
-          fields[i].getType().equals(String.class)) {
-        try {
-          String val = (String) fields[i].get(null);
-          NAMES_IDX.put(normalize(val), val);
-        } catch (Exception e) {
-          // Simply ignore...
-        }
-      }
-    }
-    normalized = (String[]) NAMES_IDX.keySet().toArray(new String[NAMES_IDX.size()]);
-  }
-  
-  
-  /** A map of all metadata attributes */
-  private Map metadata = null;
+public class Metadata implements Writable, CreativeCommons,
+DublinCore, HttpHeaders, Nutch, Office {
+
+  /**
+   * A map of all metadata attributes.
+   */
+  private Map<String, String[]> metadata = null;
+
 
-  
-  /** Constructs a new, empty metadata. */
+  /**
+   * Constructs a new, empty metadata.
+   */
   public Metadata() {
-    metadata = new HashMap();
+    metadata = new HashMap<String, String[]>();
   }
 
   /**
+   * Returns true if named value is multivalued.
+   * @param name name of metadata
+   * @return true if named value is multivalued, false if single
+   * value or null
    */
-  public boolean isMultiValued(String name) {
-    return getValues(name).length > 1;
+  public boolean isMultiValued(final String name) {
+    return metadata.get(name) != null && metadata.get(name).length > 1;
   }
 
   /**
    * Returns an array of the names contained in the metadata.
+   * @return Metadata names
    */
   public String[] names() {
-    Iterator iter = metadata.keySet().iterator();
-    List names = new ArrayList();
-    while(iter.hasNext()) {
-      names.add(getNormalizedName((String) iter.next()));
-    }
-    return (String[]) names.toArray(new String[names.size()]);
+    return metadata.keySet().toArray(new String[metadata.keySet().size()]);
   }
-  
+
   /**
    * Get the value associated to a metadata name.
    * If many values are assiociated to the specified name, then the first
@@ -121,12 +76,12 @@
    * @param name of the metadata.
    * @return the value associated to the specified metadata name.
    */
-  public String get(String name) {
-    Object values = metadata.get(getNormalizedName(name));
-    if ((values != null) && (values instanceof List)) {
-      return (String) ((List) values).get(0);
+  public String get(final String name) {
+    String[] values = metadata.get(name);
+    if (values == null) {
+      return null;
     } else {
-      return (String) values;
+      return values[0];
     }
   }
 
@@ -135,19 +90,14 @@
    * @param name of the metadata.
    * @return the values associated to a metadata name.
    */
-  public String[] getValues(String name) {
-    Object values = metadata.get(getNormalizedName(name));
-    if (values != null) {
-      if (values instanceof List) {
-        List list = (List) values;
-        return (String[]) list.toArray(new String[list.size()]);
-      } else {
-        return new String[] { (String) values };
-      }
+  public String[] getValues(final String name) {
+    String[] values = metadata.get(name);
+    if (values == null) {
+      values = new String[0];
     }
-    return new String[0];
+    return values;
   }
-  
+
   /**
    * Add a metadata name/value mapping.
    * Add the specified value to the list of values associated to the
@@ -156,31 +106,30 @@
    * @param name the metadata name.
    * @param value the metadata value.
    */
-  public void add(String name, String value) {
-    String normalized = getNormalizedName(name);
-    Object values = metadata.get(normalized);
-    if (values != null) {
-      if (values instanceof String) {
-        List list = new ArrayList();
-        list.add(values);
-        list.add(value);
-        metadata.put(normalized, list);
-      } else if (values instanceof List) {
-        ((List) values).add(value);
-      }
+  public void add(final String name, final String value) {
+    String[] values = metadata.get(name);
+    if (values == null) {
+      set(name, value);
     } else {
-      metadata.put(normalized, value);
+      String[] newValues = new String[values.length + 1];
+      System.arraycopy(values, 0, newValues, 0, values.length);
+      newValues[newValues.length - 1] = value;
+      metadata.put(name, newValues);
     }
   }
 
+  /**
+   * Copy all key-value pairs from properties.
+   * @param properties properties to copy from
+   */
   public void setAll(Properties properties) {
     Enumeration names = properties.propertyNames();
     while (names.hasMoreElements()) {
       String name = (String) names.nextElement();
-      set(name, properties.getProperty(name));
+      metadata.put(name, new String[]{properties.getProperty(name)});
     }
   }
-  
+
   /**
    * Set metadata name/value.
    * Associate the specified value to the specified metadata name. If some
@@ -190,46 +139,46 @@
    * @param value the metadata value.
    */
   public void set(String name, String value) {
-    remove(name);
-    add(name, value);
+    metadata.put(name, new String[]{value});
   }
 
   /**
    * Remove a metadata and all its associated values.
+   * @param name metadata name to remove
    */
   public void remove(String name) {
-    metadata.remove(getNormalizedName(name));
+    metadata.remove(name);
   }
-  
+
   /**
    * Returns the number of metadata names in this metadata.
+   * @return number of metadata names
    */
   public int size() {
     return metadata.size();
   }
-  
-  // Inherited Javadoc
+
   public boolean equals(Object o) {
-    
+
     if (o == null) { return false; }
-    
+
     Metadata other = null;
     try {
       other = (Metadata) o;
     } catch (ClassCastException cce) {
       return false;
     }
-    
+
     if (other.size() != size()) { return false; }
-    
+
     String[] names = names();
-    for (int i=0; i<names.length; i++) {
+    for (int i = 0; i < names.length; i++) {
       String[] otherValues = other.getValues(names[i]);
       String[] thisValues = getValues(names[i]);
       if (otherValues.length != thisValues.length) {
         return false;
       }
-      for (int j=0; j<otherValues.length; j++) {
+      for (int j = 0; j < otherValues.length; j++) {
         if (!otherValues[j].equals(thisValues[j])) {
           return false;
         }
@@ -238,13 +187,12 @@
     return true;
   }
 
-  // Inherited Javadoc
   public String toString() {
     StringBuffer buf = new StringBuffer();
     String[] names = names();
-    for (int i=0; i<names.length; i++) {
+    for (int i = 0; i < names.length; i++) {
       String[] values = getValues(names[i]);
-      for (int j=0; j<values.length; j++) {
+      for (int j = 0; j < values.length; j++) {
         buf.append(names[i])
            .append("=")
            .append(values[j])
@@ -253,83 +201,31 @@
     }
     return buf.toString();
   }
-  
-  
-  /**
-   * Get the normalized name of metadata attribute name.
-   * This method tries to find a well-known metadata name (one of the
-   * metadata names defined in this class) that matches the specified name.
-   * The matching is error tolerent. For instance,
-   * <ul>
-   *  <li>content-type gives Content-Type</li>
-   *  <li>CoNtEntType  gives Content-Type</li>
-   *  <li>ConTnTtYpe   gives Content-Type</li>
-   * </ul>
-   * If no matching with a well-known metadata name is found, then the original
-   * name is returned.
-   */
-  public static String getNormalizedName(String name) {
-    String searched = normalize(name);
-    String value = (String) NAMES_IDX.get(searched);
-
-    if ((value == null) && (normalized != null)) {
-      int threshold = searched.length() / 3;
-      for (int i=0; i<normalized.length && value == null; i++) {
-        if (StringUtils.getLevenshteinDistance(searched, normalized[i]) < threshold) {
-          value = (String) NAMES_IDX.get(normalized[i]);
-        }
-      }
-    }
-    return (value != null) ? value : name;
-  }
-    
-  private final static String normalize(String str) {
-    char c;
-    StringBuffer buf = new StringBuffer();
-    for (int i=0; i<str.length(); i++) {
-      c = str.charAt(i);
-      if (Character.isLetter(c)) {
-        buf.append(Character.toLowerCase(c));
-      }
-    }
-    return buf.toString();
-  }
 
-  
-  /* ------------------------- *
-   * <implementation:Writable> *
-   * ------------------------- */
-  
-  // Inherited Javadoc
   public final void write(DataOutput out) throws IOException {
     out.writeInt(size());
     String[] values = null;
     String[] names = names();
-    for (int i=0; i<names.length; i++) {
+    for (int i = 0; i < names.length; i++) {
       Text.writeString(out, names[i]);
       values = getValues(names[i]);
       out.writeInt(values.length);
-      for (int j=0; j<values.length; j++) {
+      for (int j = 0; j < values.length; j++) {
         Text.writeString(out, values[j]);
       }
     }
   }
 
-  // Inherited Javadoc
   public final void readFields(DataInput in) throws IOException {
     int keySize = in.readInt();
     String key;
-    for (int i=0; i<keySize; i++) {
+    for (int i = 0; i < keySize; i++) {
       key = Text.readString(in);
       int valueSize = in.readInt();
-      for (int j=0; j<valueSize; j++) {
+      for (int j = 0; j < valueSize; j++) {
         add(key, Text.readString(in));
       }
     }
   }
 
-  /* -------------------------- *
-   * </implementation:Writable> *
-   * -------------------------- */
-   
 }

Added: lucene/nutch/trunk/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java?view=auto&rev=474464
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java (added)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java Mon Nov 13 11:46:56 2006
@@ -0,0 +1,147 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.metadata;
+
+import java.lang.reflect.Field;
+import java.lang.reflect.Modifier;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.commons.lang.StringUtils;
+
+/**
+ * A decorator to Metadata that adds spellchecking capabilities to property
+ * names.
+ *
+ * All the static String fields declared by this class are used as reference
+ * names for syntax correction on meta-data naming.
+ */
+public class SpellCheckedMetadata extends Metadata {
+
+  /**
+   * Threshold divider.
+   *
+   * <code>threshold = searched.length() / TRESHOLD_DIVIDER;</code>
+   */
+  private static final int TRESHOLD_DIVIDER = 3;
+
+  /**
+   * Normalized name to name mapping.
+   */
+  private final static Map<String, String> NAMES_IDX = new HashMap<String, String>();
+
+  /**
+   * Array holding map keys.
+   */
+  private static String[] normalized = null;
+
+  // Uses self introspection to fill the metanames index and the
+  // metanames list.
+  static {
+    for (Field field : SpellCheckedMetadata.class.getFields()) {
+      int mods = field.getModifiers();
+      if (Modifier.isFinal(mods) && Modifier.isPublic(mods)
+          && Modifier.isStatic(mods) && field.getType().equals(String.class)) {
+        try {
+          String val = (String) field.get(null);
+          NAMES_IDX.put(normalize(val), val);
+        } catch (Exception e) {
+          // Simply ignore...
+        }
+      }
+    }
+    normalized = NAMES_IDX.keySet().toArray(new String[NAMES_IDX.size()]);
+  }
+
+  /**
+   * Normalizes String.
+   *
+   * @param str
+   *          the string to normalize
+   * @return normalized String
+   */
+  private static String normalize(final String str) {
+    char c;
+    StringBuffer buf = new StringBuffer();
+    for (int i = 0; i < str.length(); i++) {
+      c = str.charAt(i);
+      if (Character.isLetter(c)) {
+        buf.append(Character.toLowerCase(c));
+      }
+    }
+    return buf.toString();
+  }
+
+  /**
+   * Get the normalized name of metadata attribute name. This method tries to
+   * find a well-known metadata name (one of the metadata names defined in this
+   * class) that matches the specified name. The matching is error tolerant. For
+   * instance,
+   * <ul>
+   * <li>content-type gives Content-Type</li>
+   * <li>CoNtEntType gives Content-Type</li>
+   * <li>ConTnTtYpe gives Content-Type</li>
+   * </ul>
+   * If no matching with a well-known metadata name is found, then the original
+   * name is returned.
+   *
+   * @param name
+   *          Name to normalize
+   * @return normalized name
+   */
+  public static String getNormalizedName(final String name) {
+    String searched = normalize(name);
+    String value = NAMES_IDX.get(searched);
+
+    if ((value == null) && (normalized != null)) {
+      int threshold = searched.length() / TRESHOLD_DIVIDER;
+      for (int i = 0; i < normalized.length && value == null; i++) {
+        if (StringUtils.getLevenshteinDistance(searched, normalized[i]) < threshold) {
+          value = NAMES_IDX.get(normalized[i]);
+        }
+      }
+    }
+    return (value != null) ? value : name;
+  }
+
+  @Override
+  public void remove(final String name) {
+    super.remove(getNormalizedName(name));
+  }
+
+  @Override
+  public void add(final String name, final String value) {
+    String normalized = getNormalizedName(name);
+    super.add(normalized, value);
+  }
+
+  @Override
+  public String[] getValues(final String name) {
+    return super.getValues(getNormalizedName(name));
+  }
+
+  @Override
+  public String get(final String name) {
+    return super.get(getNormalizedName(name));
+  }
+
+  @Override
+  public void set(final String name, final String value) {
+    super.set(getNormalizedName(name), value);
+  }
+
+}

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java?view=diff&rev=474464&r1=474463&r2=474464
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java Mon Nov 13 11:46:56 2006
@@ -17,17 +17,25 @@
 
 package org.apache.nutch.protocol;
 
-import java.util.*;
-import java.io.*;
-
-import org.apache.hadoop.io.*;
-import org.apache.hadoop.fs.*;
-import org.apache.hadoop.conf.*;
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.Arrays;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.ArrayFile;
+import org.apache.hadoop.io.CompressedWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.VersionMismatchException;
 import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.SpellCheckedMetadata;
+import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.mime.MimeType;
-import org.apache.nutch.util.mime.MimeTypes;
 import org.apache.nutch.util.mime.MimeTypeException;
-import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.mime.MimeTypes;
 
 public final class Content extends CompressedWritable {
 
@@ -36,23 +44,38 @@
   private final static byte VERSION = 2;
 
   private byte version;
+
   private String url;
+
   private String base;
+
   private byte[] content;
+
   private String contentType;
+
   private Metadata metadata;
+
   private boolean mimeTypeMagic;
+
   private MimeTypes mimeTypes;
 
-  public Content() {}
-    
+  private boolean inflated;
+
+  public Content() {
+    inflated = false;
+  }
+
   public Content(String url, String base, byte[] content, String contentType,
-                 Metadata metadata, Configuration conf) {
+      Metadata metadata, Configuration conf) {
 
-    if (url == null) throw new IllegalArgumentException("null url");
-    if (base == null) throw new IllegalArgumentException("null base");
-    if (content == null) throw new IllegalArgumentException("null content");
-    if (metadata == null) throw new IllegalArgumentException("null metadata");
+    if (url == null)
+      throw new IllegalArgumentException("null url");
+    if (base == null)
+      throw new IllegalArgumentException("null base");
+    if (content == null)
+      throw new IllegalArgumentException("null content");
+    if (metadata == null)
+      throw new IllegalArgumentException("null metadata");
 
     this.url = url;
     this.base = base;
@@ -61,21 +84,30 @@
     this.mimeTypeMagic = conf.getBoolean("mime.type.magic", true);
     this.mimeTypes = MimeTypes.get(conf.get("mime.types.file"));
     this.contentType = getContentType(contentType, url, content);
+    inflated = true;
+  }
+
+  public void ensureInflated() {
+    if (inflated) {
+      return;
+    }
+    super.ensureInflated();
+    inflated = true;
   }
 
   protected final void readFieldsCompressed(DataInput in) throws IOException {
     version = in.readByte();
-    metadata = new Metadata();
+    metadata = new SpellCheckedMetadata();
     switch (version) {
     case 0:
     case 1:
-      url = UTF8.readString(in);                    // read url
-      base = UTF8.readString(in);                   // read base
+      url = UTF8.readString(in); // read url
+      base = UTF8.readString(in); // read base
 
-      content = new byte[in.readInt()];             // read content
+      content = new byte[in.readInt()]; // read content
       in.readFully(content);
 
-      contentType = UTF8.readString(in);            // read contentType
+      contentType = UTF8.readString(in); // read contentType
       // reconstruct metadata
       int keySize = in.readInt();
       String key;
@@ -88,33 +120,33 @@
       }
       break;
     case VERSION:
-      url = Text.readString(in);                    // read url
-      base = Text.readString(in);                   // read base
+      url = Text.readString(in); // read url
+      base = Text.readString(in); // read base
 
-      content = new byte[in.readInt()];             // read content
+      content = new byte[in.readInt()]; // read content
       in.readFully(content);
 
-      contentType = Text.readString(in);            // read contentType
-      metadata.readFields(in);                    // read meta data
+      contentType = Text.readString(in); // read contentType
+      metadata.readFields(in); // read meta data
       break;
     default:
       throw new VersionMismatchException(VERSION, version);
     }
-    
+
   }
 
   protected final void writeCompressed(DataOutput out) throws IOException {
     out.writeByte(VERSION);
 
-    Text.writeString(out, url);                   // write url
-    Text.writeString(out, base);                  // write base
+    Text.writeString(out, url); // write url
+    Text.writeString(out, base); // write base
 
-    out.writeInt(content.length);                 // write content
+    out.writeInt(content.length); // write content
     out.write(content);
 
-    Text.writeString(out, contentType);           // write contentType
-    
-    metadata.write(out);                           // write metadata
+    Text.writeString(out, contentType); // write contentType
+
+    metadata.write(out); // write metadata
   }
 
   public static Content read(DataInput in) throws IOException {
@@ -146,6 +178,7 @@
     ensureInflated();
     return content;
   }
+
   public void setContent(byte[] content) {
     ensureInflated();
     this.content = content;
@@ -159,6 +192,7 @@
     ensureInflated();
     return contentType;
   }
+
   public void setContentType(String contentType) {
     ensureInflated();
     this.contentType = contentType;
@@ -178,30 +212,28 @@
 
   public boolean equals(Object o) {
     ensureInflated();
-    if (!(o instanceof Content)){
+    if (!(o instanceof Content)) {
       return false;
     }
-    Content that = (Content)o;
+    Content that = (Content) o;
     that.ensureInflated();
-    return
-      this.url.equals(that.url) &&
-      this.base.equals(that.base) &&
-      Arrays.equals(this.getContent(), that.getContent()) &&
-      this.contentType.equals(that.contentType) &&
-      this.metadata.equals(that.metadata);
+    return this.url.equals(that.url) && this.base.equals(that.base)
+        && Arrays.equals(this.getContent(), that.getContent())
+        && this.contentType.equals(that.contentType)
+        && this.metadata.equals(that.metadata);
   }
 
   public String toString() {
     ensureInflated();
     StringBuffer buffer = new StringBuffer();
 
-    buffer.append("Version: " + version + "\n" );
-    buffer.append("url: " + url + "\n" );
-    buffer.append("base: " + base + "\n" );
-    buffer.append("contentType: " + contentType + "\n" );
-    buffer.append("metadata: " + metadata + "\n" );
+    buffer.append("Version: " + version + "\n");
+    buffer.append("url: " + url + "\n");
+    buffer.append("base: " + base + "\n");
+    buffer.append("contentType: " + contentType + "\n");
+    buffer.append("metadata: " + metadata + "\n");
     buffer.append("Content:\n");
-    buffer.append(new String(content));           // try default encoding
+    buffer.append(new String(content)); // try default encoding
 
     return buffer.toString();
 
@@ -210,7 +242,7 @@
   public static void main(String argv[]) throws Exception {
 
     String usage = "Content (-local | -dfs <namenode:port>) recno segment";
-    
+
     if (argv.length < 3) {
       System.out.println("usage:" + usage);
       return;
@@ -224,7 +256,8 @@
       Path file = new Path(segment, DIR_NAME);
       System.out.println("Reading from file: " + file);
 
-      ArrayFile.Reader contents = new ArrayFile.Reader(fs, file.toString(), conf);
+      ArrayFile.Reader contents = new ArrayFile.Reader(fs, file.toString(),
+          conf);
 
       Content content = new Content();
       contents.get(recno, content);
@@ -241,10 +274,10 @@
   private String getContentType(String typeName, String url, byte[] data) {
     MimeType type = null;
     try {
-        typeName = MimeType.clean(typeName);
-        type = typeName == null ? null : this.mimeTypes.forName(typeName);
+      typeName = MimeType.clean(typeName);
+      type = typeName == null ? null : this.mimeTypes.forName(typeName);
     } catch (MimeTypeException mte) {
-        // Seems to be a malformed mime type name...
+      // Seems to be a malformed mime type name...
     }
 
     if (typeName == null || type == null || !type.matches(url)) {
@@ -254,8 +287,8 @@
       type = this.mimeTypes.getMimeType(url);
       typeName = type == null ? typeName : type.getName();
     }
-    if (typeName == null || type == null ||
-        (this.mimeTypeMagic && type.hasMagic() && !type.matches(data))) {
+    if (typeName == null || type == null
+        || (this.mimeTypeMagic && type.hasMagic() && !type.matches(data))) {
       // If no mime-type already found, or the one found doesn't match
       // the magic bytes it should be, then, guess a mime-type from the
       // document content (magic bytes)

Modified: lucene/nutch/trunk/src/test/org/apache/nutch/metadata/TestMetadata.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/metadata/TestMetadata.java?view=diff&rev=474464&r1=474463&r2=474464
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/metadata/TestMetadata.java (original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/metadata/TestMetadata.java Mon Nov 13 11:46:56 2006
@@ -16,7 +16,6 @@
  */
 package org.apache.nutch.metadata;
 
-// JDK imports
 import java.io.ByteArrayInputStream;
 import java.io.ByteArrayOutputStream;
 import java.io.DataInputStream;
@@ -24,71 +23,52 @@
 import java.io.IOException;
 import java.util.Properties;
 import junit.framework.Test;
-
-// JUnit imports
 import junit.framework.TestCase;
 import junit.framework.TestSuite;
 import junit.textui.TestRunner;
 
-// Nutch imports
-import org.apache.nutch.metadata.Metadata;
-
-
 /**
  * JUnit based tests of class {@link org.apache.nutch.metadata.Metadata}.
- *
- * @author Chris Mattmann
- * @author J&eacute;r&ocirc;me Charron
  */
 public class TestMetadata extends TestCase {
 
-  
+  private static final String CONTENTTYPE = "contenttype";
+
   public TestMetadata(String testName) {
     super(testName);
   }
-  
+
   public static Test suite() {
     return new TestSuite(TestMetadata.class);
   }
-  
+
   public static void main(String[] args) {
     TestRunner.run(suite());
   }
-  
-
-  /** Test for the <code>getNormalizedName(String)</code> method. */
-  public void testGetNormalizedName() {
-    assertEquals("Content-Type", Metadata.getNormalizedName("Content-Type"));
-    assertEquals("Content-Type", Metadata.getNormalizedName("ContentType"));
-    assertEquals("Content-Type", Metadata.getNormalizedName("Content-type"));
-    assertEquals("Content-Type", Metadata.getNormalizedName("contenttype"));
-    assertEquals("Content-Type", Metadata.getNormalizedName("contentype"));
-    assertEquals("Content-Type", Metadata.getNormalizedName("contntype"));
-  }
 
   /** Test for the <code>add(String, String)</code> method. */
   public void testAdd() {
     String[] values = null;
     Metadata meta = new Metadata();
 
-    values = meta.getValues("contentype");
+    values = meta.getValues(CONTENTTYPE);
     assertEquals(0, values.length);
 
-    meta.add("contentype", "value1");
-    values = meta.getValues("contentype");
+    meta.add(CONTENTTYPE, "value1");
+    values = meta.getValues(CONTENTTYPE);
     assertEquals(1, values.length);
     assertEquals("value1", values[0]);
-    
-    meta.add("Content-Type", "value2");
-    values = meta.getValues("contentype");
+
+    meta.add(CONTENTTYPE, "value2");
+    values = meta.getValues(CONTENTTYPE);
     assertEquals(2, values.length);
     assertEquals("value1", values[0]);
     assertEquals("value2", values[1]);
 
     // NOTE : For now, the same value can be added many times.
-    //        Should it be changed?
-    meta.add("ContentType", "value1");
-    values = meta.getValues("Content-Type");
+    // Should it be changed?
+    meta.add(CONTENTTYPE, "value1");
+    values = meta.getValues(CONTENTTYPE);
     assertEquals(3, values.length);
     assertEquals("value1", values[0]);
     assertEquals("value2", values[1]);
@@ -100,43 +80,43 @@
     String[] values = null;
     Metadata meta = new Metadata();
 
-    values = meta.getValues("contentype");
+    values = meta.getValues(CONTENTTYPE);
     assertEquals(0, values.length);
 
-    meta.set("contentype", "value1");
-    values = meta.getValues("contentype");
+    meta.set(CONTENTTYPE, "value1");
+    values = meta.getValues(CONTENTTYPE);
     assertEquals(1, values.length);
     assertEquals("value1", values[0]);
-    
-    meta.set("Content-Type", "value2");
-    values = meta.getValues("contentype");
+
+    meta.set(CONTENTTYPE, "value2");
+    values = meta.getValues(CONTENTTYPE);
     assertEquals(1, values.length);
     assertEquals("value2", values[0]);
-    
-    meta.set("contenttype", "new value 1");
+
+    meta.set(CONTENTTYPE, "new value 1");
     meta.add("contenttype", "new value 2");
-    values = meta.getValues("contentype");
+    values = meta.getValues(CONTENTTYPE);
     assertEquals(2, values.length);
     assertEquals("new value 1", values[0]);
     assertEquals("new value 2", values[1]);
   }
-  
-  /** Test for <code>setAll(Properties)</code> method */
+
+  /** Test for <code>setAll(Properties)</code> method. */
   public void testSetProperties() {
     String[] values = null;
     Metadata meta = new Metadata();
     Properties props = new Properties();
-    
+
     meta.setAll(props);
     assertEquals(0, meta.size());
-    
+
     props.setProperty("name-one", "value1.1");
     meta.setAll(props);
     assertEquals(1, meta.size());
     values = meta.getValues("name-one");
     assertEquals(1, values.length);
     assertEquals("value1.1", values[0]);
-    
+
     props.setProperty("name-two", "value2.1");
     meta.setAll(props);
     assertEquals(2, meta.size());
@@ -147,19 +127,18 @@
     assertEquals(1, values.length);
     assertEquals("value2.1", values[0]);
   }
-    
-  /** Test for <code>get(String)</code> method */
+
+  /** Test for <code>get(String)</code> method. */
   public void testGet() {
     Metadata meta = new Metadata();
     assertNull(meta.get("a-name"));
-    
     meta.add("a-name", "value-1");
     assertEquals("value-1", meta.get("a-name"));
     meta.add("a-name", "value-2");
     assertEquals("value-1", meta.get("a-name"));
   }
-    
-  /** Test for <code>isMultiValued()</code> method */
+
+  /** Test for <code>isMultiValued(String)</code> method. */
   public void testIsMultiValued() {
     Metadata meta = new Metadata();
     assertFalse(meta.isMultiValued("key"));
@@ -169,13 +148,13 @@
     assertTrue(meta.isMultiValued("key"));
   }
 
-  /** Test for <code>names</code> method */
+  /** Test for <code>names()</code> method. */
   public void testNames() {
     String[] names = null;
     Metadata meta = new Metadata();
     names = meta.names();
     assertEquals(0, names.length);
-    
+
     meta.add("name-one", "value");
     names = meta.names();
     assertEquals(1, names.length);
@@ -184,8 +163,8 @@
     names = meta.names();
     assertEquals(2, names.length);
   }
-  
-  /** Test for <code>remove(String)</code> method */
+
+  /** Test for <code>remove(String)</code> method. */
   public void testRemove() {
     Metadata meta = new Metadata();
     meta.remove("name-one");
@@ -206,7 +185,7 @@
     assertNull(meta.get("name-two"));
   }
 
-  /** Test for <code>equals(Object)</code> method */
+  /** Test for <code>equals(Object)</code> method. */
   public void testObject() {
     Metadata meta1 = new Metadata();
     Metadata meta2 = new Metadata();
@@ -230,8 +209,8 @@
     meta2.add("name-two", "value-2.x");
     assertFalse(meta1.equals(meta2));
   }
-  
-  /** Test for <code>Writable</code> implementation */
+
+  /** Test for <code>Writable</code> implementation. */
   public void testWritable() {
     Metadata result = null;
     Metadata meta = new Metadata();
@@ -252,17 +231,19 @@
     assertEquals("value-2.1", result.getValues("name-two")[0]);
     assertEquals("value-2.2", result.getValues("name-two")[1]);
   }
-  
+
   private Metadata writeRead(Metadata meta) {
     Metadata readed = new Metadata();
     try {
       ByteArrayOutputStream out = new ByteArrayOutputStream();
       meta.write(new DataOutputStream(out));
-      readed.readFields(new DataInputStream(new ByteArrayInputStream(out.toByteArray())));
+      readed.readFields(new DataInputStream(new ByteArrayInputStream(out
+          .toByteArray())));
     } catch (IOException ioe) {
       fail(ioe.toString());
     }
     return readed;
   }
-  	
+
 }
+

Added: lucene/nutch/trunk/src/test/org/apache/nutch/metadata/TestSpellCheckedMetadata.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/metadata/TestSpellCheckedMetadata.java?view=auto&rev=474464
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/metadata/TestSpellCheckedMetadata.java (added)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/metadata/TestSpellCheckedMetadata.java Mon Nov 13 11:46:56 2006
@@ -0,0 +1,267 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.metadata;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.util.Properties;
+import junit.framework.Test;
+import junit.framework.TestCase;
+import junit.framework.TestSuite;
+import junit.textui.TestRunner;
+
+/**
+ * JUnit based tests of class
+ * {@link org.apache.nutch.metadata.SpellCheckedMetadata}.
+ *
+ * @author Chris Mattmann
+ * @author J&eacute;r&ocirc;me Charron
+ */
+public class TestSpellCheckedMetadata extends TestCase {
+
+  public TestSpellCheckedMetadata(String testName) {
+    super(testName);
+  }
+
+  public static Test suite() {
+    return new TestSuite(TestSpellCheckedMetadata.class);
+  }
+
+  public static void main(String[] args) {
+    TestRunner.run(suite());
+  }
+
+  /** Test for the <code>getNormalizedName(String)</code> method. */
+  public void testGetNormalizedName() {
+    assertEquals("Content-Type", SpellCheckedMetadata
+        .getNormalizedName("Content-Type"));
+    assertEquals("Content-Type", SpellCheckedMetadata
+        .getNormalizedName("ContentType"));
+    assertEquals("Content-Type", SpellCheckedMetadata
+        .getNormalizedName("Content-type"));
+    assertEquals("Content-Type", SpellCheckedMetadata
+        .getNormalizedName("contenttype"));
+    assertEquals("Content-Type", SpellCheckedMetadata
+        .getNormalizedName("contentype"));
+    assertEquals("Content-Type", SpellCheckedMetadata
+        .getNormalizedName("contntype"));
+  }
+
+  /** Test for the <code>add(String, String)</code> method. */
+  public void testAdd() {
+    String[] values = null;
+    SpellCheckedMetadata meta = new SpellCheckedMetadata();
+
+    values = meta.getValues("contentype");
+    assertEquals(0, values.length);
+
+    meta.add("contentype", "value1");
+    values = meta.getValues("contentype");
+    assertEquals(1, values.length);
+    assertEquals("value1", values[0]);
+
+    meta.add("Content-Type", "value2");
+    values = meta.getValues("contentype");
+    assertEquals(2, values.length);
+    assertEquals("value1", values[0]);
+    assertEquals("value2", values[1]);
+
+    // NOTE : For now, the same value can be added many times.
+    // Should it be changed?
+    meta.add("ContentType", "value1");
+    values = meta.getValues("Content-Type");
+    assertEquals(3, values.length);
+    assertEquals("value1", values[0]);
+    assertEquals("value2", values[1]);
+    assertEquals("value1", values[2]);
+  }
+
+  /** Test for the <code>set(String, String)</code> method. */
+  public void testSet() {
+    String[] values = null;
+    SpellCheckedMetadata meta = new SpellCheckedMetadata();
+
+    values = meta.getValues("contentype");
+    assertEquals(0, values.length);
+
+    meta.set("contentype", "value1");
+    values = meta.getValues("contentype");
+    assertEquals(1, values.length);
+    assertEquals("value1", values[0]);
+
+    meta.set("Content-Type", "value2");
+    values = meta.getValues("contentype");
+    assertEquals(1, values.length);
+    assertEquals("value2", values[0]);
+
+    meta.set("contenttype", "new value 1");
+    meta.add("contenttype", "new value 2");
+    values = meta.getValues("contentype");
+    assertEquals(2, values.length);
+    assertEquals("new value 1", values[0]);
+    assertEquals("new value 2", values[1]);
+  }
+
+  /** Test for <code>setAll(Properties)</code> method. */
+  public void testSetProperties() {
+    String[] values = null;
+    SpellCheckedMetadata meta = new SpellCheckedMetadata();
+    Properties props = new Properties();
+
+    meta.setAll(props);
+    assertEquals(0, meta.size());
+
+    props.setProperty("name-one", "value1.1");
+    meta.setAll(props);
+    assertEquals(1, meta.size());
+    values = meta.getValues("name-one");
+    assertEquals(1, values.length);
+    assertEquals("value1.1", values[0]);
+
+    props.setProperty("name-two", "value2.1");
+    meta.setAll(props);
+    assertEquals(2, meta.size());
+    values = meta.getValues("name-one");
+    assertEquals(1, values.length);
+    assertEquals("value1.1", values[0]);
+    values = meta.getValues("name-two");
+    assertEquals(1, values.length);
+    assertEquals("value2.1", values[0]);
+  }
+
+  /** Test for <code>get(String)</code> method. */
+  public void testGet() {
+    SpellCheckedMetadata meta = new SpellCheckedMetadata();
+    assertNull(meta.get("a-name"));
+
+    meta.add("a-name", "value-1");
+    assertEquals("value-1", meta.get("a-name"));
+    meta.add("a-name", "value-2");
+    assertEquals("value-1", meta.get("a-name"));
+  }
+
+  /** Test for <code>isMultiValued(String)</code> method. */
+  public void testIsMultiValued() {
+    SpellCheckedMetadata meta = new SpellCheckedMetadata();
+    assertFalse(meta.isMultiValued("key"));
+    meta.add("key", "value1");
+    assertFalse(meta.isMultiValued("key"));
+    meta.add("key", "value2");
+    assertTrue(meta.isMultiValued("key"));
+  }
+
+  /** Test for <code>names()</code> method. */
+  public void testNames() {
+    String[] names = null;
+    SpellCheckedMetadata meta = new SpellCheckedMetadata();
+    names = meta.names();
+    assertEquals(0, names.length);
+
+    meta.add("name-one", "value");
+    names = meta.names();
+    assertEquals(1, names.length);
+    assertEquals("name-one", names[0]);
+    meta.add("name-two", "value");
+    names = meta.names();
+    assertEquals(2, names.length);
+  }
+
+  /** Test for <code>remove(String)</code> method. */
+  public void testRemove() {
+    SpellCheckedMetadata meta = new SpellCheckedMetadata();
+    meta.remove("name-one");
+    assertEquals(0, meta.size());
+    meta.add("name-one", "value-1.1");
+    meta.add("name-one", "value-1.2");
+    meta.add("name-two", "value-2.2");
+    assertEquals(2, meta.size());
+    assertNotNull(meta.get("name-one"));
+    assertNotNull(meta.get("name-two"));
+    meta.remove("name-one");
+    assertEquals(1, meta.size());
+    assertNull(meta.get("name-one"));
+    assertNotNull(meta.get("name-two"));
+    meta.remove("name-two");
+    assertEquals(0, meta.size());
+    assertNull(meta.get("name-one"));
+    assertNull(meta.get("name-two"));
+  }
+
+  /** Test for <code>equals(Object)</code> method. */
+  public void testObject() {
+    SpellCheckedMetadata meta1 = new SpellCheckedMetadata();
+    SpellCheckedMetadata meta2 = new SpellCheckedMetadata();
+    assertFalse(meta1.equals(null));
+    assertFalse(meta1.equals("String"));
+    assertTrue(meta1.equals(meta2));
+    meta1.add("name-one", "value-1.1");
+    assertFalse(meta1.equals(meta2));
+    meta2.add("name-one", "value-1.1");
+    assertTrue(meta1.equals(meta2));
+    meta1.add("name-one", "value-1.2");
+    assertFalse(meta1.equals(meta2));
+    meta2.add("name-one", "value-1.2");
+    assertTrue(meta1.equals(meta2));
+    meta1.add("name-two", "value-2.1");
+    assertFalse(meta1.equals(meta2));
+    meta2.add("name-two", "value-2.1");
+    assertTrue(meta1.equals(meta2));
+    meta1.add("name-two", "value-2.2");
+    assertFalse(meta1.equals(meta2));
+    meta2.add("name-two", "value-2.x");
+    assertFalse(meta1.equals(meta2));
+  }
+
+  /** Test for <code>Writable</code> implementation. */
+  public void testWritable() {
+    SpellCheckedMetadata result = null;
+    SpellCheckedMetadata meta = new SpellCheckedMetadata();
+    result = writeRead(meta);
+    assertEquals(0, result.size());
+    meta.add("name-one", "value-1.1");
+    result = writeRead(meta);
+    assertEquals(1, result.size());
+    assertEquals(1, result.getValues("name-one").length);
+    assertEquals("value-1.1", result.get("name-one"));
+    meta.add("name-two", "value-2.1");
+    meta.add("name-two", "value-2.2");
+    result = writeRead(meta);
+    assertEquals(2, result.size());
+    assertEquals(1, result.getValues("name-one").length);
+    assertEquals("value-1.1", result.getValues("name-one")[0]);
+    assertEquals(2, result.getValues("name-two").length);
+    assertEquals("value-2.1", result.getValues("name-two")[0]);
+    assertEquals("value-2.2", result.getValues("name-two")[1]);
+  }
+
+  private SpellCheckedMetadata writeRead(SpellCheckedMetadata meta) {
+    SpellCheckedMetadata readed = new SpellCheckedMetadata();
+    try {
+      ByteArrayOutputStream out = new ByteArrayOutputStream();
+      meta.write(new DataOutputStream(out));
+      readed.readFields(new DataInputStream(new ByteArrayInputStream(out
+          .toByteArray())));
+    } catch (IOException ioe) {
+      fail(ioe.toString());
+    }
+    return readed;
+  }
+
+}

Modified: lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java?view=diff&rev=474464&r1=474463&r2=474464
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java (original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java Mon Nov 13 11:46:56 2006
@@ -18,6 +18,7 @@
 package org.apache.nutch.protocol;
 
 import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.SpellCheckedMetadata;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.WritableTestUtils;
@@ -28,9 +29,9 @@
 /** Unit tests for Content. */
 
 public class TestContent extends TestCase {
-    
+
   private static Configuration conf = NutchConfiguration.create();
-    
+
   public TestContent(String name) { super(name); }
 
   public void testContent() throws Exception {
@@ -39,13 +40,13 @@
 
     String url = "http://www.foo.com/";
 
-    Metadata metaData = new Metadata();
+    SpellCheckedMetadata metaData = new SpellCheckedMetadata();
     metaData.add("Host", "www.foo.com");
     metaData.add("Content-Type", "text/html");
 
     Content r = new Content(url, url, page.getBytes("UTF8"), "text/html",
                             metaData, conf);
-                        
+
     WritableTestUtils.testWritable(r);
     assertEquals("text/html", r.getMetadata().get("Content-Type"));
     assertEquals("text/html", r.getMetadata().get("content-type"));
@@ -105,5 +106,5 @@
                     null, p, conf);
     assertNull(c.getContentType());
   }
-	
+
 }