You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by cu...@apache.org on 2005/12/15 19:49:31 UTC

svn commit: r357056 - in /lucene/nutch/branches/mapred/src: java/org/apache/nutch/crawl/ java/org/apache/nutch/fetcher/ java/org/apache/nutch/parse/ java/org/apache/nutch/protocol/ java/org/apache/nutch/servlet/ java/org/apache/nutch/tools/ java/org/ap...

Author: cutting
Date: Thu Dec 15 10:49:12 2005
New Revision: 357056

URL: http://svn.apache.org/viewcvs?rev=357056&view=rev
Log:
svn merge -r 326936:357049 https://svn.apache.org/repos/asf/lucene/nutch/trunk

Added:
    lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/ContentProperties.java
      - copied unchanged from r356540, lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ContentProperties.java
Modified:
    lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java
    lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Indexer.java
    lucene/nutch/branches/mapred/src/java/org/apache/nutch/fetcher/Fetcher.java
    lucene/nutch/branches/mapred/src/java/org/apache/nutch/parse/ParseData.java
    lucene/nutch/branches/mapred/src/java/org/apache/nutch/parse/ParseStatus.java
    lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/Content.java
    lucene/nutch/branches/mapred/src/java/org/apache/nutch/servlet/Cached.java
    lucene/nutch/branches/mapred/src/java/org/apache/nutch/tools/ParseSegment.java
    lucene/nutch/branches/mapred/src/java/org/apache/nutch/util/mime/MimeType.java
    lucene/nutch/branches/mapred/src/java/org/apache/nutch/util/mime/MimeTypes.java
    lucene/nutch/branches/mapred/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java
    lucene/nutch/branches/mapred/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java
    lucene/nutch/branches/mapred/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
    lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
    lucene/nutch/branches/mapred/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java
    lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
    lucene/nutch/branches/mapred/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
    lucene/nutch/branches/mapred/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java
    lucene/nutch/branches/mapred/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java
    lucene/nutch/branches/mapred/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java
    lucene/nutch/branches/mapred/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java
    lucene/nutch/branches/mapred/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java
    lucene/nutch/branches/mapred/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java
    lucene/nutch/branches/mapred/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java
    lucene/nutch/branches/mapred/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
    lucene/nutch/branches/mapred/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
    lucene/nutch/branches/mapred/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
    lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java
    lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
    lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/MultiProperties.java
    lucene/nutch/branches/mapred/src/test/org/apache/nutch/parse/TestParseData.java
    lucene/nutch/branches/mapred/src/test/org/apache/nutch/protocol/TestContent.java
    lucene/nutch/branches/mapred/src/test/org/apache/nutch/tools/TestSegmentMergeTool.java
    lucene/nutch/branches/mapred/src/web/jsp/cached.jsp

Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java?rev=357056&r1=357055&r2=357056&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java Thu Dec 15 10:49:12 2005
@@ -18,7 +18,6 @@
 
 import java.io.IOException;
 import java.io.File;
-import java.util.Properties;
 
 import org.apache.nutch.io.*;
 import org.apache.nutch.fs.*;
@@ -193,7 +192,7 @@
 
       if (content == null) {
         String url = key.toString();
-        content = new Content(url, url, new byte[0], "", new Properties());
+        content = new Content(url,url,new byte[0],"",new ContentProperties());
       }
 
       content.getMetadata().setProperty           // add digest to metadata

Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Indexer.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Indexer.java?rev=357056&r1=357055&r2=357056&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Indexer.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Indexer.java Thu Dec 15 10:49:12 2005
@@ -26,6 +26,7 @@
 import org.apache.nutch.util.*;
 import org.apache.nutch.mapred.*;
 import org.apache.nutch.parse.*;
+import org.apache.nutch.protocol.*;
 import org.apache.nutch.analysis.*;
 
 import org.apache.nutch.indexer.*;
@@ -187,7 +188,7 @@
     }
 
     Document doc = new Document();
-    Properties meta = parseData.getMetadata();
+    ContentProperties meta = parseData.getMetadata();
     String[] anchors = inlinks!=null ? inlinks.getAnchors() : new String[0];
 
     // add segment, used to map from merged index back to segment files

Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=357056&r1=357055&r2=357056&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/fetcher/Fetcher.java Thu Dec 15 10:49:12 2005
@@ -234,7 +234,7 @@
       MD5Hash hash = null;
       String url = fle.getPage().getURL().toString();
       if (content == null) {
-        content = new Content(url, url, new byte[0], "", new Properties());
+        content = new Content(url, url, new byte[0], "", new ContentProperties());
         hash = MD5Hash.digest(url);
       } else {
         hash = MD5Hash.digest(content.getContent());
@@ -263,7 +263,7 @@
                 + status.toString());
         outputPage(new FetcherOutput(fle, hash, protocolStatus),
                 content, new ParseText(""),
-                new ParseData(status, "", new Outlink[0], new Properties()));
+                new ParseData(status, "", new Outlink[0], new ContentProperties()));
       }
       return status;
     }

Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/parse/ParseData.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/parse/ParseData.java?rev=357056&r1=357055&r2=357056&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/parse/ParseData.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/parse/ParseData.java Thu Dec 15 10:49:12 2005
@@ -21,6 +21,7 @@
 
 import org.apache.nutch.io.*;
 import org.apache.nutch.fs.*;
+import org.apache.nutch.protocol.ContentProperties;
 import org.apache.nutch.tools.UpdateDatabaseTool;
 
 
@@ -34,12 +35,12 @@
 
   private String title;
   private Outlink[] outlinks;
-  private Properties metadata;
+  private ContentProperties metadata;
   private ParseStatus status;
 
   public ParseData() {}
 
-  public ParseData(ParseStatus status, String title, Outlink[] outlinks, Properties metadata) {
+  public ParseData(ParseStatus status, String title, Outlink[] outlinks, ContentProperties metadata) {
     this.status = status;
     this.title = title;
     this.outlinks = outlinks;
@@ -62,7 +63,7 @@
   /** Other page properties.  This is the place to find format-specific
    * properties.  Different parser implementations for different content types
    * will populate this differently. */
-  public Properties getMetadata() { return metadata; }
+  public ContentProperties getMetadata() { return metadata; }
 
   /** Return the value of a metadata property. */
   public String get(String name) { return getMetadata().getProperty(name); }
@@ -94,7 +95,7 @@
     }
     
     int propertyCount = in.readInt();             // read metadata
-    metadata = new Properties();
+    metadata = new ContentProperties();
     for (int i = 0; i < propertyCount; i++) {
       metadata.put(UTF8.readString(in), UTF8.readString(in));
     }

Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/parse/ParseStatus.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/parse/ParseStatus.java?rev=357056&r1=357055&r2=357056&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/parse/ParseStatus.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/parse/ParseStatus.java Thu Dec 15 10:49:12 2005
@@ -12,6 +12,7 @@
 
 import org.apache.nutch.io.VersionedWritable;
 import org.apache.nutch.io.WritableUtils;
+import org.apache.nutch.protocol.ContentProperties;
 
 /**
  * @author Andrzej Bialecki &lt;ab@getopt.org&gt;
@@ -230,7 +231,7 @@
     private ParseData data = null;
     
     public EmptyParseImpl(ParseStatus status) {
-      data = new ParseData(status, "", new Outlink[0], new Properties());
+      data = new ParseData(status, "", new Outlink[0], new ContentProperties());
     }
     
     public ParseData getData() {

Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/Content.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/Content.java?rev=357056&r1=357055&r2=357056&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/Content.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/Content.java Thu Dec 15 10:49:12 2005
@@ -22,6 +22,9 @@
 import org.apache.nutch.io.*;
 import org.apache.nutch.fs.*;
 import org.apache.nutch.util.*;
+import org.apache.nutch.util.mime.MimeType;
+import org.apache.nutch.util.mime.MimeTypes;
+import org.apache.nutch.util.mime.MimeTypeException;
 
 public final class Content extends CompressedWritable {
 
@@ -29,28 +32,35 @@
 
   private final static byte VERSION = 1;
 
+  /** A flag that tells if magic resolution must be performed */
+  private final static boolean MAGIC =
+    NutchConf.get().getBoolean("mime.type.magic", true);
+  
+  /** Get the MimeTypes resolver instance. */
+  private final static MimeTypes MIME = 
+    MimeTypes.get(NutchConf.get().get("mime.types.file"));
+
   private byte version;
   private String url;
   private String base;
   private byte[] content;
   private String contentType;
-  private Properties metadata;
+  private ContentProperties metadata;
 
   public Content() {}
     
   public Content(String url, String base, byte[] content, String contentType,
-                 Properties metadata){
+                 ContentProperties metadata){
 
     if (url == null) throw new IllegalArgumentException("null url");
     if (base == null) throw new IllegalArgumentException("null base");
     if (content == null) throw new IllegalArgumentException("null content");
-    if (contentType == null) throw new IllegalArgumentException("null type");
     if (metadata == null) throw new IllegalArgumentException("null metadata");
 
     this.url = url;
     this.base = base;
     this.content = content;
-    this.contentType = contentType;
+    this.contentType = getContentType(contentType, url, content);
     this.metadata = metadata;
   }
 
@@ -68,7 +78,7 @@
     contentType = UTF8.readString(in);            // read contentType
 
     int propertyCount = in.readInt();             // read metadata
-    metadata = new Properties();
+    metadata = new ContentProperties();
     for (int i = 0; i < propertyCount; i++) {
       metadata.put(UTF8.readString(in), UTF8.readString(in));
     }
@@ -142,7 +152,7 @@
   }
 
   /** Other protocol-specific data. */
-  public Properties getMetadata() {
+  public ContentProperties getMetadata() {
     ensureInflated();
     return metadata;
   }
@@ -213,4 +223,33 @@
       nfs.close();
     }
   }
+
+  private String getContentType(String typeName, String url, byte[] data) {
+    
+    MimeType type = null;
+    try {
+        typeName = MimeType.clean(typeName);
+        type = typeName == null ? null : MIME.forName(typeName);
+    } catch (MimeTypeException mte) {
+        // Seems to be a malformed mime type name...
+    }
+
+    if (typeName == null || type == null || !type.matches(url)) {
+      // If no mime-type header, or cannot find a corresponding registered
+      // mime-type, or the one found doesn't match the url pattern
+      // it should be, then guess a mime-type from the url pattern
+      type = MIME.getMimeType(url);
+      typeName = type == null ? typeName : type.getName();
+    }
+    if (typeName == null || type == null ||
+        (MAGIC && type.hasMagic() && !type.matches(data))) {
+      // If no mime-type already found, or the one found doesn't match
+      // the magic bytes it should be, then, guess a mime-type from the
+      // document content (magic bytes)
+      type = MIME.getMimeType(data);
+      typeName = type == null ? typeName : type.getName();
+    }
+    return typeName;
+  }
+
 }

Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/servlet/Cached.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/servlet/Cached.java?rev=357056&r1=357055&r2=357056&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/servlet/Cached.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/servlet/Cached.java Thu Dec 15 10:49:12 2005
@@ -16,6 +16,7 @@
 
 package org.apache.nutch.servlet;
 
+import org.apache.nutch.protocol.ContentProperties;
 import org.apache.nutch.searcher.NutchBean;
 import org.apache.nutch.searcher.Hit;
 import org.apache.nutch.searcher.HitDetails;
@@ -76,7 +77,7 @@
     byte[] bytes = bean.getContent(details);
 
     // pass all original headers? only these for now.
-    Properties metaData = bean.getParseData(details).getMetadata();
+    ContentProperties metaData = bean.getParseData(details).getMetadata();
     String contentType = (String) metaData.get("Content-Type");
     //String lastModified = (String) metaData.get("Last-Modified");
     //String contentLength = (String) metaData.get("Content-Length");

Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/tools/ParseSegment.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/tools/ParseSegment.java?rev=357056&r1=357055&r2=357056&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/tools/ParseSegment.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/tools/ParseSegment.java Thu Dec 15 10:49:12 2005
@@ -240,7 +240,7 @@
         }
         outputPage(new ParseText(""),
                 new ParseData(new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_MISSING_CONTENT),
-                        "", new Outlink[0], new Properties()));
+                        "", new Outlink[0], new ContentProperties()));
       }
     }
 
@@ -250,7 +250,7 @@
         return;
       }
       outputPage(new ParseText(""),
-                 new ParseData(status, "", new Outlink[0], new Properties()));
+                 new ParseData(status, "", new Outlink[0], new ContentProperties()));
     }
       
     private void outputPage

Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/util/mime/MimeType.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/util/mime/MimeType.java?rev=357056&r1=357055&r2=357056&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/util/mime/MimeType.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/util/mime/MimeType.java Thu Dec 15 10:49:12 2005
@@ -227,11 +227,21 @@
         return minLength;
     }
     
-    boolean hasMagic() {
+    public boolean hasMagic() {
         return (magics.size() > 0);
     }
     
-    boolean matches(byte[] data) {
+    public boolean matches(String url) {
+        boolean match = false;
+        int index = url.lastIndexOf('.');
+        if ((index != -1) && (index < url.length()-1)) {
+            // There's an extension, so check whether it is one of this type's own
+            match = extensions.contains(url.substring(index + 1));
+         }
+         return match;
+    }
+
+    public boolean matches(byte[] data) {
         if (!hasMagic()) { return false; }
         
         Magic tested = null;

Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/util/mime/MimeTypes.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/util/mime/MimeTypes.java?rev=357056&r1=357055&r2=357056&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/util/mime/MimeTypes.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/util/mime/MimeTypes.java Thu Dec 15 10:49:12 2005
@@ -39,7 +39,10 @@
     public final static String DEFAULT = "application/octet-stream";
 
     /** All the registered MimeTypes */
-    private ArrayList types = new ArrayList();    
+    private ArrayList types = new ArrayList();
+
+    /** All the registered MimeType indexed by name */
+    private HashMap typesIdx = new HashMap();
 
     /** MimeTypes indexed on the file extension */
     private Map extIdx = new HashMap();
@@ -211,7 +214,14 @@
         }
         return mimeType;
     }
-    
+   
+   /**
+    * Return a MimeType from its name.
+    */
+   public MimeType forName(String name) {
+      return (MimeType) typesIdx.get(name);
+   }
+
     /**
      * Return the minimum length of data to provide to analyzing methods
      * based on the document's content in order to check all the known
@@ -241,6 +251,7 @@
      * @param type is the mime-type to add.
      */
     void add(MimeType type) {
+        typesIdx.put(type.getName(), type);
         types.add(type);
         // Update minLentgth
         minLength = Math.max(minLength, type.getMinLength());

Modified: lucene/nutch/branches/mapred/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java?rev=357056&r1=357055&r2=357056&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java Thu Dec 15 10:49:12 2005
@@ -18,6 +18,7 @@
 
 import org.apache.nutch.parse.*;
 import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ContentProperties;
 import org.apache.nutch.util.NutchConf;
 
 import java.util.*;
@@ -51,7 +52,7 @@
     }
 
     /** Scan the document adding attributes to metadata.*/
-    public static void walk(Node doc, URL base, Properties metadata)
+    public static void walk(Node doc, URL base, ContentProperties metadata)
       throws ParseException {
 
       // walk the DOM tree, scanning for license data

Modified: lucene/nutch/branches/mapred/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java?rev=357056&r1=357055&r2=357056&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java Thu Dec 15 10:49:12 2005
@@ -19,6 +19,7 @@
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.ParseUtil;
 import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ContentProperties;
 
 import java.util.Properties;
 import java.io.*;
@@ -56,10 +57,10 @@
     byte[] bytes = out.toByteArray();
 
     Content content =
-      new Content(url, url, bytes, contentType, new Properties());
+      new Content(url, url, bytes, contentType, new ContentProperties());
     Parse parse = ParseUtil.parseByParserId("parse-html",content);
 
-    Properties metadata = parse.getData().getMetadata();
+    ContentProperties metadata = parse.getData().getMetadata();
     assertEquals(license, metadata.get("License-Url"));
     assertEquals(location, metadata.get("License-Location"));
     assertEquals(type, metadata.get("Work-Type"));

Modified: lucene/nutch/branches/mapred/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=357056&r1=357055&r2=357056&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java Thu Dec 15 10:49:12 2005
@@ -29,6 +29,7 @@
 import org.apache.nutch.net.protocols.HttpDateFormat;
 
 import org.apache.nutch.parse.Parse;
+import org.apache.nutch.protocol.ContentProperties;
 
 import org.apache.nutch.indexer.IndexingFilter;
 import org.apache.nutch.indexer.IndexingException;
@@ -86,7 +87,7 @@
     String url = fo.getUrl().toString();
 
     // normalize metaData (see note in the method below).
-    Properties metaData = normalizeMeta(parse.getData().getMetadata());
+    ContentProperties metaData = normalizeMeta(parse.getData().getMetadata());
 
     addTime(doc, metaData, url, fo);
 
@@ -101,7 +102,7 @@
     
   // Add time related meta info.  Add last-modified if present.  Index date as
   // last-modified, or, if that's not present, use fetch time.
-  private Document addTime(Document doc, Properties metaData, String url,
+  private Document addTime(Document doc, ContentProperties metaData, String url,
                            FetcherOutput fo) {
     long time = -1;
 
@@ -169,7 +170,7 @@
   }
 
   // Add Content-Length
-  private Document addLength(Document doc, Properties metaData, String url) {
+  private Document addLength(Document doc, ContentProperties metaData, String url) {
     String contentLength = metaData.getProperty("content-length");
 
     if (contentLength != null)
@@ -179,7 +180,7 @@
   }
 
   // Add Content-Type and its primaryType and subType
-  private Document addType(Document doc, Properties metaData, String url) {
+  private Document addType(Document doc, ContentProperties metaData, String url) {
     MimeType mimeType = null;
     String contentType = metaData.getProperty("content-type");
     if (contentType == null) {
@@ -259,7 +260,7 @@
     }
   }
 
-  private Document resetTitle(Document doc, Properties metaData, String url) {
+  private Document resetTitle(Document doc, ContentProperties metaData, String url) {
     String contentDisposition = metaData.getProperty("content-disposition");
     if (contentDisposition == null)
       return doc;
@@ -284,8 +285,8 @@
   // (*) empty header value
   // Note: the original metaData should be kept intact,
   // because there is a benefit to preserve whatever comes from server.
-  private Properties normalizeMeta(Properties old) {
-    Properties normalized = new Properties();
+  private ContentProperties normalizeMeta(ContentProperties old) {
+      ContentProperties normalized = new ContentProperties();
 
     for (Enumeration e = old.propertyNames(); e.hasMoreElements();) {
       String key = (String) e.nextElement();

Modified: lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java?rev=357056&r1=357055&r2=357056&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java Thu Dec 15 10:49:12 2005
@@ -15,8 +15,7 @@
  */
 package org.apache.nutch.analysis.lang;
 
-// JDK imports
-import java.util.Properties;
+
 
 // JUnit imports
 import junit.framework.TestCase;
@@ -26,6 +25,7 @@
 import org.apache.nutch.parse.Parser;
 import org.apache.nutch.parse.ParserFactory;
 import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ContentProperties;
 
 
 public class TestHTMLLanguageParser extends TestCase {
@@ -122,7 +122,7 @@
   
   
   private Content getContent(String text) {
-    Properties p = new Properties();
+    ContentProperties p = new ContentProperties();
     p.put("Content-Type", "text/html");
 
     Content content = new Content(URL, BASE, text.getBytes(), "text/html", p);

Modified: lucene/nutch/branches/mapred/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java?rev=357056&r1=357055&r2=357056&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java Thu Dec 15 10:49:12 2005
@@ -17,6 +17,7 @@
 package org.apache.nutch.parse.ext;
 
 import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ContentProperties;
 import org.apache.nutch.parse.ParseStatus;
 import org.apache.nutch.parse.Parser;
 import org.apache.nutch.parse.Parse;
@@ -155,7 +156,7 @@
     Outlink[] outlinks = OutlinkExtractor.getOutlinks(text);
 
     // collect meta data
-    Properties metaData = new Properties();
+    ContentProperties metaData = new ContentProperties();
     metaData.putAll(content.getMetadata()); // copy through
 
     ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, metaData);

Modified: lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java?rev=357056&r1=357055&r2=357056&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java Thu Dec 15 10:49:12 2005
@@ -31,6 +31,7 @@
 import org.apache.html.dom.*;
 
 import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ContentProperties;
 import org.apache.nutch.util.*;
 import org.apache.nutch.parse.*;
 
@@ -106,19 +107,14 @@
     String text = "";
     String title = "";
     Outlink[] outlinks = new Outlink[0];
-    Properties metadata = new Properties();
+    ContentProperties metadata = new ContentProperties();
 
-    // check that contentType is one we can handle
-    String contentType = content.getContentType();
-    if (!"".equals(contentType) && !contentType.startsWith("text/html"))
-      return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_INVALID_FORMAT,
-              "Content-Type not text/html: " + contentType).getEmptyParse();
-    
     // parse the content
     DocumentFragment root;
     try {
       byte[] contentInOctets = content.getContent();
       InputSource input = new InputSource(new ByteArrayInputStream(contentInOctets));
+      String contentType = content.getMetadata().getProperty("Content-Type");
       String encoding = StringUtil.parseCharacterEncoding(contentType);
       if (encoding!=null) {
         metadata.put("OriginalCharEncoding", encoding);
@@ -271,7 +267,7 @@
     in.readFully(bytes);
     Parse parse = new HtmlParser().getParse(new Content(url,url,
                                                         bytes,"text/html",
-                                                        new Properties()));
+                                                        new ContentProperties()));
     System.out.println("data: "+parse.getData());
 
     System.out.println("text: "+parse.getText());

Modified: lucene/nutch/branches/mapred/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java?rev=357056&r1=357055&r2=357056&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java Thu Dec 15 10:49:12 2005
@@ -22,6 +22,7 @@
 import org.apache.nutch.parse.ParseStatus;
 import org.apache.nutch.parse.Parser;
 import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ContentProperties;
 import org.apache.nutch.util.LogFormatter;
 import org.apache.oro.text.regex.MatchResult;
 import org.apache.oro.text.regex.Pattern;
@@ -56,7 +57,7 @@
     walk(doc, parse, metaTags, url, outlinks);
     if (outlinks.size() > 0) {
       Outlink[] old = parse.getData().getOutlinks();
-      Properties metadata = parse.getData().getMetadata();
+      ContentProperties metadata = parse.getData().getMetadata();
       String title = parse.getData().getTitle();
       List list = Arrays.asList(old);
       outlinks.addAll(list);
@@ -136,7 +137,7 @@
       idx = Math.min(MAX_TITLE_LEN, script.length());
       title = script.substring(0, idx);
     }
-    Properties metadata = new Properties();
+    ContentProperties metadata = new ContentProperties();
     metadata.putAll(c.getMetadata());
     ParseData pd = new ParseData(ParseStatus.STATUS_SUCCESS, title,
             outlinks, metadata);

Modified: lucene/nutch/branches/mapred/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java?rev=357056&r1=357055&r2=357056&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java Thu Dec 15 10:49:12 2005
@@ -30,6 +30,7 @@
 import org.apache.nutch.parse.ParseStatus;
 import org.apache.nutch.parse.Parser;
 import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ContentProperties;
 import org.apache.nutch.util.LogFormatter;
 
 /**
@@ -73,7 +74,7 @@
 
     byte[] raw = getRawBytes(new File(file));
 
-    Properties prop = new Properties();
+    ContentProperties prop = new ContentProperties();
     prop.setProperty("Content-Length", "" + raw.length);
 
     Content content = new Content(file, file, raw, MIME_TYPE, prop);
@@ -88,15 +89,6 @@
    */
   public Parse getParse(final Content content) {
 
-    // check that contentType is one we can handle
-    final String contentType = content.getContentType();
-
-    if (contentType != null && !contentType.startsWith(MIME_TYPE)) {
-      return new ParseStatus(ParseStatus.FAILED,
-          ParseStatus.FAILED_INVALID_FORMAT, "Content-Type is not ["
-              + MIME_TYPE + "] was: " + contentType).getEmptyParse();
-    }
-
     String plainText = null;
     String title = null;
     Outlink[] outlinks = null;
@@ -130,7 +122,7 @@
     }
 
     // collect meta data
-    final Properties metadata = new Properties();
+    final ContentProperties metadata = new ContentProperties();
     metadata.putAll(content.getMetadata()); // copy through
 
     if (properties != null) {
@@ -169,4 +161,4 @@
     }
 
   }
-}
\ No newline at end of file
+}

Modified: lucene/nutch/branches/mapred/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java?rev=357056&r1=357055&r2=357056&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java Thu Dec 15 10:49:12 2005
@@ -17,6 +17,7 @@
 package org.apache.nutch.parse.msword;
 
 import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ContentProperties;
 import org.apache.nutch.util.LogFormatter;
 import org.apache.nutch.parse.ParseStatus;
 import org.apache.nutch.parse.Parser;
@@ -56,12 +57,6 @@
 
   public Parse getParse(Content content) {
 
-    // check that contentType is one we can handle
-    String contentType = content.getContentType();
-    if (contentType != null && !contentType.startsWith("application/msword"))
-      return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_INVALID_FORMAT,
-        "Content-Type not application/msword: " + contentType).getEmptyParse();
-
     String text = null;
     String title = null;
     Properties properties = null;
@@ -102,7 +97,7 @@
     }
 
     // collect meta data
-    Properties metadata = new Properties();
+    ContentProperties metadata = new ContentProperties();
     metadata.putAll(content.getMetadata()); // copy through
 
     if(properties != null) {

Modified: lucene/nutch/branches/mapred/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java?rev=357056&r1=357055&r2=357056&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java Thu Dec 15 10:49:12 2005
@@ -26,6 +26,7 @@
 import org.pdfbox.exceptions.InvalidPasswordException;
 
 import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ContentProperties;
 import org.apache.nutch.util.LogFormatter;
 import org.apache.nutch.parse.ParseStatus;
 import org.apache.nutch.parse.Parser;
@@ -83,12 +84,6 @@
 
   public Parse getParse(Content content) {
 
-    // check that contentType is one we can handle
-    String contentType = content.getContentType();
-    if (contentType != null && !contentType.startsWith("application/pdf"))
-      return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_INVALID_FORMAT,
-        "Content-Type not application/pdf: " + contentType).getEmptyParse();
-
     // in memory representation of pdf file
     PDDocument pdf = null;
 
@@ -165,7 +160,7 @@
     Outlink[] outlinks = OutlinkExtractor.getOutlinks(text);
 
     // collect meta data
-    Properties metadata = new Properties();
+    ContentProperties metadata = new ContentProperties();
     metadata.putAll(content.getMetadata()); // copy through
 
     ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, metadata);

Modified: lucene/nutch/branches/mapred/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java?rev=357056&r1=357055&r2=357056&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java Thu Dec 15 10:49:12 2005
@@ -101,15 +101,6 @@
      */
     public Parse getParse(Content content) {
 
-        // check that contentType is one we can handle
-        String contentType = content.getContentType();
-        if (contentType != null
-                && (!contentType.startsWith("text/xml") && !contentType
-                        .startsWith("application/rss+xml")))
-            return new ParseStatus(ParseStatus.FAILED_INVALID_FORMAT,
-                    "Content-Type not text/xml or application/rss+xml: "
-                            + contentType).getEmptyParse();
-
         List theRSSChannels = null;
 
         try {

Modified: lucene/nutch/branches/mapred/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java?rev=357056&r1=357055&r2=357056&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java Thu Dec 15 10:49:12 2005
@@ -19,13 +19,14 @@
 import java.util.Properties;
 
 import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ContentProperties;
 import org.apache.nutch.parse.*;
 import org.apache.nutch.util.*;
 
 public class TextParser implements Parser {
   public Parse getParse(Content content) {
     // copy content meta data through
-    Properties metadata = new Properties();
+    ContentProperties metadata = new ContentProperties();
     metadata.putAll(content.getMetadata());
 
     //ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "", new Outlink[0], metadata);

Modified: lucene/nutch/branches/mapred/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java?rev=357056&r1=357055&r2=357056&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java Thu Dec 15 10:49:12 2005
@@ -31,6 +31,7 @@
 import org.apache.nutch.parse.ParseStatus;
 import org.apache.nutch.parse.Parser;
 import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ContentProperties;
 import org.apache.nutch.util.LogFormatter;
 
 /**
@@ -47,13 +48,6 @@
     
     public Parse getParse(final Content content) {
         
-        // check that contentType is one we can handle
-        final String contentType = content.getContentType();
-        if (contentType != null && !contentType.startsWith("application/zip")) {
-            return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_INVALID_FORMAT,
-              "Content-Type not application/zip: " + contentType).getEmptyParse();
-        }
-        
         String resultText = null;
         String resultTitle = null;
         Outlink[] outlinks = null;
@@ -87,7 +81,7 @@
         }
         
         // collect meta data
-        final Properties metadata = new Properties();
+        final ContentProperties metadata = new ContentProperties();
         metadata.putAll(content.getMetadata()); // copy through
         
         if (resultText == null) {

Modified: lucene/nutch/branches/mapred/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java?rev=357056&r1=357055&r2=357056&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java Thu Dec 15 10:49:12 2005
@@ -33,6 +33,7 @@
 import org.apache.nutch.parse.ParseException;
 import org.apache.nutch.parse.Outlink;
 import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ContentProperties;
 import org.apache.nutch.util.LogFormatter;
 import org.apache.nutch.util.NutchConf;
 import org.apache.nutch.util.mime.MimeTypes;
@@ -84,7 +85,7 @@
           // Trying to resolve the Mime-Type
           String contentType = MIME.getMimeType(fname).getName();
           try {
-            Properties metadata = new Properties();
+            ContentProperties metadata = new ContentProperties();
             metadata.setProperty("Content-Length", Long.toString(entry.getSize()));
             metadata.setProperty("Content-Type", contentType);
             Content content = new Content(newurl, base, b, contentType, metadata);

Modified: lucene/nutch/branches/mapred/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java?rev=357056&r1=357055&r2=357056&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java Thu Dec 15 10:49:12 2005
@@ -25,9 +25,7 @@
 
 // Nutch imports
 import org.apache.nutch.protocol.Content;
-import org.apache.nutch.util.NutchConf;
-import org.apache.nutch.util.mime.MimeType;
-import org.apache.nutch.util.mime.MimeTypes;
+import org.apache.nutch.protocol.ContentProperties;
 
 
 /************************************
@@ -58,20 +56,11 @@
  ***********************************/
 public class FileResponse {
 
-  /** A flag that tells if magic resolution must be performed */
-  private final static boolean MAGIC =
-        NutchConf.get().getBoolean("mime.type.magic", true);
-
-  /** Get the MimeTypes resolver instance. */
-  private final static MimeTypes MIME = 
-        MimeTypes.get(NutchConf.get().get("mime.types.file"));
-
-
   private String orig;
   private String base;
   private byte[] content;
   private int code;
-  private Properties headers = new Properties();
+  private ContentProperties headers = new ContentProperties();
 
   private final File file;
 
@@ -201,15 +190,8 @@
     hdrs.put("Last-Modified",
       this.file.httpDateFormat.toString(f.lastModified()));
 
-    MimeType contentType = null;
-    if (MAGIC) {
-      contentType = MIME.getMimeType(f.getName(), this.content);
-    } else {
-      contentType = MIME.getMimeType(f.getName());
-    }
-    if (contentType != null) {
-        hdrs.put("Content-Type", contentType.getName());
-    }
+    hdrs.put("Content-Type", "");   // No Content-Type at file protocol level
+
     this.headers.putAll(hdrs);
 
     // response code

Modified: lucene/nutch/branches/mapred/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java?rev=357056&r1=357055&r2=357056&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java Thu Dec 15 10:49:12 2005
@@ -25,10 +25,7 @@
 import org.apache.commons.net.ftp.parser.ParserInitializationException;
 
 import org.apache.nutch.protocol.Content;
-
-import org.apache.nutch.util.NutchConf;
-import org.apache.nutch.util.mime.MimeType;
-import org.apache.nutch.util.mime.MimeTypes;
+import org.apache.nutch.protocol.ContentProperties;
 
 import java.net.InetAddress;
 import java.net.URL;
@@ -58,20 +55,12 @@
  * @author John Xing
  ***********************************/
 public class FtpResponse {
-    
-  /** A flag that tells if magic resolution must be performed */
-  private final static boolean MAGIC =
-        NutchConf.get().getBoolean("mime.type.magic", true);
-
-  /** Get the MimeTypes resolver instance. */
-  private final static MimeTypes MIME = 
-        MimeTypes.get(NutchConf.get().get("mime.types.file"));
-    
+
   private String orig;
   private String base;
   private byte[] content;
   private int code;
-  private Properties headers = new Properties();
+  private ContentProperties headers = new ContentProperties();
 
   private final Ftp ftp;
 
@@ -314,16 +303,6 @@
         ftp.httpDateFormat.toString(ftpFile.getTimestamp()));
       this.content = os.toByteArray();
 
-      MimeType contentType = null;
-      if (MAGIC) {
-        contentType = MIME.getMimeType(path, this.content);
-      } else {
-        contentType = MIME.getMimeType(path);
-      }
-      if (contentType != null) {
-        this.headers.put("Content-Type", contentType.getName());
-      }
-
 //      // approximate bytes sent and read
 //      if (this.httpAccounting != null) {
 //        this.httpAccounting.incrementBytesSent(path.length());
@@ -359,16 +338,6 @@
       this.headers.put("Last-Modified",
         ftp.httpDateFormat.toString(ftpFile.getTimestamp()));
       this.content = os.toByteArray();
-
-      MimeType contentType = null;
-      if (MAGIC) {
-        contentType = MIME.getMimeType(path, this.content);
-      } else {
-        contentType = MIME.getMimeType(path);
-      }
-      if (contentType != null) {
-        this.headers.put("Content-Type", contentType.getName());
-      }
 
 //      // approximate bytes sent and read
 //      if (this.httpAccounting != null) {

Modified: lucene/nutch/branches/mapred/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java?rev=357056&r1=357055&r2=357056&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java Thu Dec 15 10:49:12 2005
@@ -32,31 +32,20 @@
 import java.util.logging.Level;
 
 import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ContentProperties;
 import org.apache.nutch.protocol.ProtocolException;
 
 import org.apache.nutch.util.GZIPUtils;
-import org.apache.nutch.util.NutchConf;
-import org.apache.nutch.util.mime.MimeType;
-import org.apache.nutch.util.mime.MimeTypes;
 
 
 /** An HTTP response. */
 public class HttpResponse {
-
-  /** A flag that tells if magic resolution must be performed */
-  private final static boolean MAGIC =
-        NutchConf.get().getBoolean("mime.type.magic", true);
-
-  /** Get the MimeTypes resolver instance. */
-  private final static MimeTypes MIME = 
-        MimeTypes.get(NutchConf.get().get("mime.types.file"));
-
   
   private String orig;
   private String base;
   private byte[] content;
   private int code;
-  private Properties headers = new Properties();
+  private ContentProperties headers = new ContentProperties();
 
   /** Returns the response code. */
   public int getCode() { return code; }
@@ -69,21 +58,9 @@
   public byte[] getContent() { return content; }
 
   public Content toContent() {
-    String contentType = getHeader("Content-Type");
-    if (contentType == null) {
-      MimeType type = null;
-      if (MAGIC) {
-        type = MIME.getMimeType(orig, content);
-      } else {
-        type = MIME.getMimeType(orig);
-      }
-      if (type != null) {
-          contentType = type.getName();
-      } else {
-          contentType = "";
-      }
-    }
-    return new Content(orig, base, content, contentType, headers);
+    return new Content(orig, base, content,
+                       getHeader("Content-Type"),
+                       headers);
   }
 
   public HttpResponse(URL url) throws ProtocolException, IOException {

Modified: lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java?rev=357056&r1=357055&r2=357056&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java Thu Dec 15 10:49:12 2005
@@ -11,6 +11,8 @@
 import java.util.TreeMap;
 import java.util.logging.Level;
 import java.util.logging.Logger;
+
+import org.apache.nutch.protocol.ContentProperties;
 import org.apache.nutch.util.LogFormatter;
 import org.apache.nutch.util.NutchConf;
 
@@ -44,7 +46,7 @@
     
     private HttpAuthenticationFactory() { }
     
-    public static HttpAuthentication findAuthentication(Properties header) {
+    public static HttpAuthentication findAuthentication(ContentProperties header) {
         if (header == null) return null;
         
     	try {

Modified: lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java?rev=357056&r1=357055&r2=357056&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java Thu Dec 15 10:49:12 2005
@@ -4,9 +4,6 @@
 package org.apache.nutch.protocol.httpclient;
 
 import org.apache.nutch.protocol.Content;
-import org.apache.nutch.util.NutchConf;
-import org.apache.nutch.util.mime.MimeType;
-import org.apache.nutch.util.mime.MimeTypes;
 
 import org.apache.commons.httpclient.Header;
 import org.apache.commons.httpclient.HttpVersion;
@@ -24,13 +21,6 @@
  * An HTTP response.
  */
 public class HttpResponse {
-  /** A flag that tells if magic resolution must be performed */
-  private final static boolean MAGIC =
-        NutchConf.get().getBoolean("mime.type.magic", true);
-
-  /** Get the MimeTypes resolver instance. */
-  private final static MimeTypes MIME = 
-        MimeTypes.get(NutchConf.get().get("mime.types.file"));
 
   private String orig;
 
@@ -63,22 +53,10 @@
   }
 
   public Content toContent() {
-    String contentType = getHeader("Content-Type");
-    if (contentType == null) {
-      MimeType type = null;
-      if (MAGIC) {
-        type = MIME.getMimeType(orig, content);
-      } else {
-        type = MIME.getMimeType(orig);
-      }
-      if (type != null) {
-          contentType = type.getName();
-      } else {
-          contentType = "";
-      }
-    }
-    if (content == null) content = EMPTY_CONTENT;
-    return new Content(orig, base, content, contentType, headers);
+    return new Content(orig, base,
+                       (content == null ? EMPTY_CONTENT : content),
+                       getHeader("Content-Type"),
+                       headers);
   }
 
   public HttpResponse(URL url) throws IOException {

Modified: lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/MultiProperties.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/MultiProperties.java?rev=357056&r1=357055&r2=357056&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/MultiProperties.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/MultiProperties.java Thu Dec 15 10:49:12 2005
@@ -10,17 +10,18 @@
 import java.util.Iterator;
 import java.util.logging.Logger;
 
+import org.apache.nutch.protocol.ContentProperties;
 import org.apache.nutch.util.LogFormatter;
 
 /**
- * An extension to {@link Properties} which allows multiple values for a single key.
+ * An extension to {@link ContentProperties} which allows multiple values for a single key.
  * The {@link #get(Object)} method may return a single value or a
  * {@link java.util.Collection} of values.
  *
  * @author Matt Tencati
  */
 
-public class MultiProperties extends Properties {
+public class MultiProperties extends ContentProperties {
     public static final Logger LOG = LogFormatter
             .getLogger("net.nutch.protocol.http.MultiProperties");
 
@@ -31,7 +32,7 @@
      */
     public MultiProperties() {
         super();
-        multiMap = new TreeMap();
+        multiMap = new TreeMap(String.CASE_INSENSITIVE_ORDER);
     }
 
     /**
@@ -41,7 +42,7 @@
      */
     public MultiProperties(Properties defaults) {
         super(defaults);
-        multiMap = new TreeMap();
+        multiMap = new TreeMap(String.CASE_INSENSITIVE_ORDER);
     }
 
     /** 

Modified: lucene/nutch/branches/mapred/src/test/org/apache/nutch/parse/TestParseData.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/test/org/apache/nutch/parse/TestParseData.java?rev=357056&r1=357055&r2=357056&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/test/org/apache/nutch/parse/TestParseData.java (original)
+++ lucene/nutch/branches/mapred/src/test/org/apache/nutch/parse/TestParseData.java Thu Dec 15 10:49:12 2005
@@ -16,10 +16,8 @@
 
 package org.apache.nutch.parse;
 
-import java.io.*;
-import java.util.Properties;
 import org.apache.nutch.io.*;
-import org.apache.nutch.pagedb.*;
+import org.apache.nutch.protocol.ContentProperties;
 import junit.framework.TestCase;
 
 /** Unit tests for ParseData. */
@@ -36,7 +34,7 @@
       new Outlink("http://bar.com/", "Bar")
     };
 
-    Properties metaData = new Properties();
+    ContentProperties metaData = new ContentProperties();
     metaData.put("Language", "en/us");
     metaData.put("Charset", "UTF-8");
 

Modified: lucene/nutch/branches/mapred/src/test/org/apache/nutch/protocol/TestContent.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/test/org/apache/nutch/protocol/TestContent.java?rev=357056&r1=357055&r2=357056&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/test/org/apache/nutch/protocol/TestContent.java (original)
+++ lucene/nutch/branches/mapred/src/test/org/apache/nutch/protocol/TestContent.java Thu Dec 15 10:49:12 2005
@@ -16,10 +16,7 @@
 
 package org.apache.nutch.protocol;
 
-import java.io.*;
-import java.util.Properties;
 import org.apache.nutch.io.*;
-import org.apache.nutch.pagedb.*;
 import junit.framework.TestCase;
 
 /** Unit tests for Content. */
@@ -33,7 +30,7 @@
 
     String url = "http://www.foo.com/";
 
-    Properties metaData = new Properties();
+    ContentProperties metaData = new ContentProperties();
     metaData.put("Host", "www.foo.com");
     metaData.put("Content-Type", "text/html");
 
@@ -41,6 +38,62 @@
                             metaData);
                         
     TestWritable.testWritable(r);
+    assertEquals("text/html", r.getMetadata().get("Content-Type"));
+    assertEquals("text/html", r.getMetadata().get("content-type"));
+  }
+
+  /** Unit tests for getContentType(String, String, byte[]) method. */
+  public void testGetContentType() throws Exception {
+    Content c = null;
+    ContentProperties p = new ContentProperties();
+
+    c = new Content("http://www.foo.com/",
+                    "http://www.foo.com/",
+                    "".getBytes("UTF8"),
+                    "text/html; charset=UTF-8", p);
+    assertEquals("text/html", c.getContentType());
+
+    c = new Content("http://www.foo.com/foo.html",
+                    "http://www.foo.com/",
+                    "".getBytes("UTF8"),
+                    "", p);
+    assertEquals("text/html", c.getContentType());
+
+    c = new Content("http://www.foo.com/foo.html",
+                    "http://www.foo.com/",
+                    "".getBytes("UTF8"),
+                    null, p);
+    assertEquals("text/html", c.getContentType());
+
+    c = new Content("http://www.foo.com/",
+                    "http://www.foo.com/",
+                    "<html></html>".getBytes("UTF8"),
+                    "", p);
+    assertEquals("text/html", c.getContentType());
+
+    c = new Content("http://www.foo.com/foo.html",
+                    "http://www.foo.com/",
+                    "<html></html>".getBytes("UTF8"),
+                    "text/plain", p);
+    assertEquals("text/html", c.getContentType());
+
+    c = new Content("http://www.foo.com/foo.png",
+                    "http://www.foo.com/",
+                    "<html></html>".getBytes("UTF8"),
+                    "text/plain", p);
+    assertEquals("text/html", c.getContentType());
+
+    c = new Content("http://www.foo.com/",
+                    "http://www.foo.com/",
+                    "".getBytes("UTF8"),
+                    "", p);
+    assertEquals("", c.getContentType());
+
+    c = new Content("http://www.foo.com/",
+                    "http://www.foo.com/",
+                    "".getBytes("UTF8"),
+                    null, p);
+    assertNull(c.getContentType());
   }
 	
 }

Modified: lucene/nutch/branches/mapred/src/test/org/apache/nutch/tools/TestSegmentMergeTool.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/test/org/apache/nutch/tools/TestSegmentMergeTool.java?rev=357056&r1=357055&r2=357056&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/test/org/apache/nutch/tools/TestSegmentMergeTool.java (original)
+++ lucene/nutch/branches/mapred/src/test/org/apache/nutch/tools/TestSegmentMergeTool.java Thu Dec 15 10:49:12 2005
@@ -32,6 +32,7 @@
 import org.apache.nutch.parse.ParseStatus;
 import org.apache.nutch.parse.ParseText;
 import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ContentProperties;
 import org.apache.nutch.protocol.ProtocolStatus;
 
 import junit.framework.TestCase;
@@ -90,7 +91,7 @@
         content.append("<p>" + k + " lines of text in the queue, " + k + " lines of text...</p>\n");
       }
       content.append("</body></html>");
-      Properties meta = new Properties();
+      ContentProperties meta = new ContentProperties();
       meta.setProperty("Content-Type", "text/html");
       meta.setProperty("Host", "http://localhost");
       meta.setProperty("Connection", "Keep-alive, close");

Modified: lucene/nutch/branches/mapred/src/web/jsp/cached.jsp
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/web/jsp/cached.jsp?rev=357056&r1=357055&r2=357056&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/web/jsp/cached.jsp (original)
+++ lucene/nutch/branches/mapred/src/web/jsp/cached.jsp Thu Dec 15 10:49:12 2005
@@ -7,6 +7,7 @@
 
   import="org.apache.nutch.searcher.*"
   import="org.apache.nutch.parse.ParseData"
+  import="org.apache.nutch.protocol.ContentProperties"
 %><%
   NutchBean bean = NutchBean.get(application);
   bean.LOG.info("cache request from " + request.getRemoteAddr());
@@ -19,7 +20,7 @@
     ResourceBundle.getBundle("org.nutch.jsp.cached", request.getLocale())
     .getLocale().getLanguage();
 
-  Properties metaData = bean.getParseData(details).getMetadata();
+  ContentProperties metaData = bean.getParseData(details).getMetadata();
 
   String content = null;
   String contentType = (String) metaData.get("Content-Type");