You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by je...@apache.org on 2005/12/11 01:37:50 UTC

svn commit: r355828 - in /lucene/nutch/trunk/src: java/org/apache/nutch/fetcher/ java/org/apache/nutch/parse/ java/org/apache/nutch/protocol/ java/org/apache/nutch/servlet/ java/org/apache/nutch/tools/ plugin/creativecommons/src/java/org/creativecommon...

Author: jerome
Date: Sat Dec 10 16:36:57 2005
New Revision: 355828

URL: http://svn.apache.org/viewcvs?rev=355828&view=rev
Log:
NUTCH-135 : Content metadata are now case insensitive (thanks to S. Groschupf)

Added:
    lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ContentProperties.java   (with props)
Modified:
    lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
    lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java
    lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java
    lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java
    lucene/nutch/trunk/src/java/org/apache/nutch/servlet/Cached.java
    lucene/nutch/trunk/src/java/org/apache/nutch/tools/ParseSegment.java
    lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java
    lucene/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java
    lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
    lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
    lucene/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java
    lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
    lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
    lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java
    lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java
    lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java
    lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java
    lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java
    lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java
    lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
    lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
    lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
    lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java
    lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/MultiProperties.java
    lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParseData.java
    lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java
    lucene/nutch/trunk/src/test/org/apache/nutch/tools/TestSegmentMergeTool.java
    lucene/nutch/trunk/src/web/jsp/cached.jsp

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=355828&r1=355827&r2=355828&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Sat Dec 10 16:36:57 2005
@@ -234,7 +234,7 @@
       MD5Hash hash = null;
       String url = fle.getPage().getURL().toString();
       if (content == null) {
-        content = new Content(url, url, new byte[0], "", new Properties());
+        content = new Content(url, url, new byte[0], "", new ContentProperties());
         hash = MD5Hash.digest(url);
       } else {
         hash = MD5Hash.digest(content.getContent());
@@ -263,7 +263,7 @@
                 + status.toString());
         outputPage(new FetcherOutput(fle, hash, protocolStatus),
                 content, new ParseText(""),
-                new ParseData(status, "", new Outlink[0], new Properties()));
+                new ParseData(status, "", new Outlink[0], new ContentProperties()));
       }
       return status;
     }

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java?rev=355828&r1=355827&r2=355828&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java Sat Dec 10 16:36:57 2005
@@ -21,6 +21,7 @@
 
 import org.apache.nutch.io.*;
 import org.apache.nutch.fs.*;
+import org.apache.nutch.protocol.ContentProperties;
 import org.apache.nutch.tools.UpdateDatabaseTool;
 
 
@@ -34,12 +35,12 @@
 
   private String title;
   private Outlink[] outlinks;
-  private Properties metadata;
+  private ContentProperties metadata;
   private ParseStatus status;
 
   public ParseData() {}
 
-  public ParseData(ParseStatus status, String title, Outlink[] outlinks, Properties metadata) {
+  public ParseData(ParseStatus status, String title, Outlink[] outlinks, ContentProperties metadata) {
     this.status = status;
     this.title = title;
     this.outlinks = outlinks;
@@ -62,7 +63,7 @@
   /** Other page properties.  This is the place to find format-specific
    * properties.  Different parser implementations for different content types
    * will populate this differently. */
-  public Properties getMetadata() { return metadata; }
+  public ContentProperties getMetadata() { return metadata; }
 
   /** Return the value of a metadata property. */
   public String get(String name) { return getMetadata().getProperty(name); }
@@ -94,7 +95,7 @@
     }
     
     int propertyCount = in.readInt();             // read metadata
-    metadata = new Properties();
+    metadata = new ContentProperties();
     for (int i = 0; i < propertyCount; i++) {
       metadata.put(UTF8.readString(in), UTF8.readString(in));
     }

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java?rev=355828&r1=355827&r2=355828&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java Sat Dec 10 16:36:57 2005
@@ -12,6 +12,7 @@
 
 import org.apache.nutch.io.VersionedWritable;
 import org.apache.nutch.io.WritableUtils;
+import org.apache.nutch.protocol.ContentProperties;
 
 /**
  * @author Andrzej Bialecki &lt;ab@getopt.org&gt;
@@ -230,7 +231,7 @@
     private ParseData data = null;
     
     public EmptyParseImpl(ParseStatus status) {
-      data = new ParseData(status, "", new Outlink[0], new Properties());
+      data = new ParseData(status, "", new Outlink[0], new ContentProperties());
     }
     
     public ParseData getData() {

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java?rev=355828&r1=355827&r2=355828&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java Sat Dec 10 16:36:57 2005
@@ -45,12 +45,12 @@
   private String base;
   private byte[] content;
   private String contentType;
-  private Properties metadata;
+  private ContentProperties metadata;
 
   public Content() {}
     
   public Content(String url, String base, byte[] content, String contentType,
-                 Properties metadata) {
+                 ContentProperties metadata) {
 
     if (url == null) throw new IllegalArgumentException("null url");
     if (base == null) throw new IllegalArgumentException("null base");
@@ -77,7 +77,7 @@
     contentType = UTF8.readString(in);            // read contentType
 
     int propertyCount = in.readInt();             // read metadata
-    metadata = new Properties();
+    metadata = new ContentProperties();
     for (int i = 0; i < propertyCount; i++) {
       metadata.put(UTF8.readString(in), UTF8.readString(in));
     }
@@ -134,7 +134,7 @@
   }
 
   /** Other protocol-specific data. */
-  public Properties getMetadata() { return metadata; }
+  public ContentProperties getMetadata() { return metadata; }
 
   /** Return the value of a metadata property. */
   public String get(String name) { return getMetadata().getProperty(name); }

Added: lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ContentProperties.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ContentProperties.java?rev=355828&view=auto
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ContentProperties.java (added)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ContentProperties.java Sat Dec 10 16:36:57 2005
@@ -0,0 +1,88 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol;
+
+import java.util.Enumeration;
+import java.util.Iterator;
+import java.util.Properties;
+import java.util.TreeMap;
+
+/**
+ * Case-insensitive properties: a TreeMap whose keys are ordered by String.CASE_INSENSITIVE_ORDER.
+ */
+public class ContentProperties extends TreeMap {
+
+    /**
+     * Constructs an empty map whose keys are compared with String.CASE_INSENSITIVE_ORDER.
+     */
+    public ContentProperties() {
+        super(String.CASE_INSENSITIVE_ORDER);
+    }
+
+    /**
+     * Constructs a case-insensitive map and copies the given entries in with putAll.
+     * Keys in defaults that differ only by case end up sharing a single entry.
+     * @param defaults entries to copy; must not be null (putAll would fail on it)
+     */
+    public ContentProperties(Properties defaults) {
+        super(String.CASE_INSENSITIVE_ORDER);
+        putAll(defaults);
+    }
+
+    /**
+     * @param key property name, matched ignoring case
+     * @return the value stored under key, cast to String, or null if there is none
+     */
+    public String getProperty(String key) {
+        return (String) get(key);
+    }
+
+    /**
+     * Stores value under key, replacing the value of any key that matches ignoring case.
+     * 
+     * @param key property name
+     * @param value property value
+     */
+    public void setProperty(String key, String value) {
+        put(key, value);
+
+    }
+
+    public Enumeration propertyNames() { // enumerates the keys in the map's (case-insensitive) order
+        return new KeyEnumeration(keySet().iterator());
+    }
+
+    class KeyEnumeration implements Enumeration { // adapts an Iterator to the Enumeration interface
+
+        private Iterator fIterator;
+
+        public KeyEnumeration(Iterator iterator) {
+            fIterator = iterator;
+        }
+
+        public boolean hasMoreElements() {
+            return fIterator.hasNext();
+
+        }
+
+        public Object nextElement() {
+            return fIterator.next();
+        }
+
+    }
+
+}

Propchange: lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ContentProperties.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/servlet/Cached.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/servlet/Cached.java?rev=355828&r1=355827&r2=355828&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/servlet/Cached.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/servlet/Cached.java Sat Dec 10 16:36:57 2005
@@ -16,6 +16,7 @@
 
 package org.apache.nutch.servlet;
 
+import org.apache.nutch.protocol.ContentProperties;
 import org.apache.nutch.searcher.NutchBean;
 import org.apache.nutch.searcher.Hit;
 import org.apache.nutch.searcher.HitDetails;
@@ -76,7 +77,7 @@
     byte[] bytes = bean.getContent(details);
 
     // pass all original headers? only these for now.
-    Properties metaData = bean.getParseData(details).getMetadata();
+    ContentProperties metaData = bean.getParseData(details).getMetadata();
     String contentType = (String) metaData.get("Content-Type");
     //String lastModified = (String) metaData.get("Last-Modified");
     //String contentLength = (String) metaData.get("Content-Length");

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/tools/ParseSegment.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/tools/ParseSegment.java?rev=355828&r1=355827&r2=355828&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/tools/ParseSegment.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/tools/ParseSegment.java Sat Dec 10 16:36:57 2005
@@ -240,7 +240,7 @@
         }
         outputPage(new ParseText(""),
                 new ParseData(new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_MISSING_CONTENT),
-                        "", new Outlink[0], new Properties()));
+                        "", new Outlink[0], new ContentProperties()));
       }
     }
 
@@ -250,7 +250,7 @@
         return;
       }
       outputPage(new ParseText(""),
-                 new ParseData(status, "", new Outlink[0], new Properties()));
+                 new ParseData(status, "", new Outlink[0], new ContentProperties()));
     }
       
     private void outputPage

Modified: lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java?rev=355828&r1=355827&r2=355828&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java Sat Dec 10 16:36:57 2005
@@ -18,6 +18,7 @@
 
 import org.apache.nutch.parse.*;
 import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ContentProperties;
 import org.apache.nutch.util.NutchConf;
 
 import java.util.*;
@@ -51,7 +52,7 @@
     }
 
     /** Scan the document adding attributes to metadata.*/
-    public static void walk(Node doc, URL base, Properties metadata)
+    public static void walk(Node doc, URL base, ContentProperties metadata)
       throws ParseException {
 
       // walk the DOM tree, scanning for license data

Modified: lucene/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java?rev=355828&r1=355827&r2=355828&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java Sat Dec 10 16:36:57 2005
@@ -19,6 +19,7 @@
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.ParseUtil;
 import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ContentProperties;
 
 import java.util.Properties;
 import java.io.*;
@@ -56,10 +57,10 @@
     byte[] bytes = out.toByteArray();
 
     Content content =
-      new Content(url, url, bytes, contentType, new Properties());
+      new Content(url, url, bytes, contentType, new ContentProperties());
     Parse parse = ParseUtil.parseByParserId("parse-html",content);
 
-    Properties metadata = parse.getData().getMetadata();
+    ContentProperties metadata = parse.getData().getMetadata();
     assertEquals(license, metadata.get("License-Url"));
     assertEquals(location, metadata.get("License-Location"));
     assertEquals(type, metadata.get("Work-Type"));

Modified: lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=355828&r1=355827&r2=355828&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java Sat Dec 10 16:36:57 2005
@@ -29,6 +29,7 @@
 import org.apache.nutch.net.protocols.HttpDateFormat;
 
 import org.apache.nutch.parse.Parse;
+import org.apache.nutch.protocol.ContentProperties;
 
 import org.apache.nutch.indexer.IndexingFilter;
 import org.apache.nutch.indexer.IndexingException;
@@ -86,7 +87,7 @@
     String url = fo.getUrl().toString();
 
     // normalize metaData (see note in the method below).
-    Properties metaData = normalizeMeta(parse.getData().getMetadata());
+    ContentProperties metaData = normalizeMeta(parse.getData().getMetadata());
 
     addTime(doc, metaData, url, fo);
 
@@ -101,7 +102,7 @@
     
   // Add time related meta info.  Add last-modified if present.  Index date as
   // last-modified, or, if that's not present, use fetch time.
-  private Document addTime(Document doc, Properties metaData, String url,
+  private Document addTime(Document doc, ContentProperties metaData, String url,
                            FetcherOutput fo) {
     long time = -1;
 
@@ -169,7 +170,7 @@
   }
 
   // Add Content-Length
-  private Document addLength(Document doc, Properties metaData, String url) {
+  private Document addLength(Document doc, ContentProperties metaData, String url) {
     String contentLength = metaData.getProperty("content-length");
 
     if (contentLength != null)
@@ -179,7 +180,7 @@
   }
 
   // Add Content-Type and its primaryType and subType
-  private Document addType(Document doc, Properties metaData, String url) {
+  private Document addType(Document doc, ContentProperties metaData, String url) {
     MimeType mimeType = null;
     String contentType = metaData.getProperty("content-type");
     if (contentType == null) {
@@ -259,7 +260,7 @@
     }
   }
 
-  private Document resetTitle(Document doc, Properties metaData, String url) {
+  private Document resetTitle(Document doc, ContentProperties metaData, String url) {
     String contentDisposition = metaData.getProperty("content-disposition");
     if (contentDisposition == null)
       return doc;
@@ -284,8 +285,8 @@
   // (*) empty header value
   // Note: the original metaData should be kept intact,
   // because there is a benefit to preserve whatever comes from server.
-  private Properties normalizeMeta(Properties old) {
-    Properties normalized = new Properties();
+  private ContentProperties normalizeMeta(ContentProperties old) {
+      ContentProperties normalized = new ContentProperties();
 
     for (Enumeration e = old.propertyNames(); e.hasMoreElements();) {
       String key = (String) e.nextElement();

Modified: lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java?rev=355828&r1=355827&r2=355828&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java (original)
+++ lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java Sat Dec 10 16:36:57 2005
@@ -15,8 +15,7 @@
  */
 package org.apache.nutch.analysis.lang;
 
-// JDK imports
-import java.util.Properties;
+
 
 // JUnit imports
 import junit.framework.TestCase;
@@ -26,6 +25,7 @@
 import org.apache.nutch.parse.Parser;
 import org.apache.nutch.parse.ParserFactory;
 import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ContentProperties;
 
 
 public class TestHTMLLanguageParser extends TestCase {
@@ -122,7 +122,7 @@
   
   
   private Content getContent(String text) {
-    Properties p = new Properties();
+    ContentProperties p = new ContentProperties();
     p.put("Content-Type", "text/html");
 
     Content content = new Content(URL, BASE, text.getBytes(), "text/html", p);

Modified: lucene/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java?rev=355828&r1=355827&r2=355828&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java Sat Dec 10 16:36:57 2005
@@ -17,6 +17,7 @@
 package org.apache.nutch.parse.ext;
 
 import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ContentProperties;
 import org.apache.nutch.parse.ParseStatus;
 import org.apache.nutch.parse.Parser;
 import org.apache.nutch.parse.Parse;
@@ -155,7 +156,7 @@
     Outlink[] outlinks = OutlinkExtractor.getOutlinks(text);
 
     // collect meta data
-    Properties metaData = new Properties();
+    ContentProperties metaData = new ContentProperties();
     metaData.putAll(content.getMetadata()); // copy through
 
     ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, metaData);

Modified: lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java?rev=355828&r1=355827&r2=355828&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java Sat Dec 10 16:36:57 2005
@@ -31,6 +31,7 @@
 import org.apache.html.dom.*;
 
 import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ContentProperties;
 import org.apache.nutch.util.*;
 import org.apache.nutch.parse.*;
 
@@ -106,7 +107,7 @@
     String text = "";
     String title = "";
     Outlink[] outlinks = new Outlink[0];
-    Properties metadata = new Properties();
+    ContentProperties metadata = new ContentProperties();
 
     // check that contentType is one we can handle
     String contentType = content.getContentType();
@@ -271,7 +272,7 @@
     in.readFully(bytes);
     Parse parse = new HtmlParser().getParse(new Content(url,url,
                                                         bytes,"text/html",
-                                                        new Properties()));
+                                                        new ContentProperties()));
     System.out.println("data: "+parse.getData());
 
     System.out.println("text: "+parse.getText());

Modified: lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java?rev=355828&r1=355827&r2=355828&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java Sat Dec 10 16:36:57 2005
@@ -22,6 +22,7 @@
 import org.apache.nutch.parse.ParseStatus;
 import org.apache.nutch.parse.Parser;
 import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ContentProperties;
 import org.apache.nutch.util.LogFormatter;
 import org.apache.oro.text.regex.MatchResult;
 import org.apache.oro.text.regex.Pattern;
@@ -56,7 +57,7 @@
     walk(doc, parse, metaTags, url, outlinks);
     if (outlinks.size() > 0) {
       Outlink[] old = parse.getData().getOutlinks();
-      Properties metadata = parse.getData().getMetadata();
+      ContentProperties metadata = parse.getData().getMetadata();
       String title = parse.getData().getTitle();
       List list = Arrays.asList(old);
       outlinks.addAll(list);
@@ -136,7 +137,7 @@
       idx = Math.min(MAX_TITLE_LEN, script.length());
       title = script.substring(0, idx);
     }
-    Properties metadata = new Properties();
+    ContentProperties metadata = new ContentProperties();
     metadata.putAll(c.getMetadata());
     ParseData pd = new ParseData(ParseStatus.STATUS_SUCCESS, title,
             outlinks, metadata);

Modified: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java?rev=355828&r1=355827&r2=355828&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java Sat Dec 10 16:36:57 2005
@@ -30,6 +30,7 @@
 import org.apache.nutch.parse.ParseStatus;
 import org.apache.nutch.parse.Parser;
 import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ContentProperties;
 import org.apache.nutch.util.LogFormatter;
 
 /**
@@ -73,7 +74,7 @@
 
     byte[] raw = getRawBytes(new File(file));
 
-    Properties prop = new Properties();
+    ContentProperties prop = new ContentProperties();
     prop.setProperty("Content-Length", "" + raw.length);
 
     Content content = new Content(file, file, raw, MIME_TYPE, prop);
@@ -130,7 +131,7 @@
     }
 
     // collect meta data
-    final Properties metadata = new Properties();
+    final ContentProperties metadata = new ContentProperties();
     metadata.putAll(content.getMetadata()); // copy through
 
     if (properties != null) {

Modified: lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java?rev=355828&r1=355827&r2=355828&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java Sat Dec 10 16:36:57 2005
@@ -17,6 +17,7 @@
 package org.apache.nutch.parse.msword;
 
 import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ContentProperties;
 import org.apache.nutch.util.LogFormatter;
 import org.apache.nutch.parse.ParseStatus;
 import org.apache.nutch.parse.Parser;
@@ -102,7 +103,7 @@
     }
 
     // collect meta data
-    Properties metadata = new Properties();
+    ContentProperties metadata = new ContentProperties();
     metadata.putAll(content.getMetadata()); // copy through
 
     if(properties != null) {

Modified: lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java?rev=355828&r1=355827&r2=355828&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java Sat Dec 10 16:36:57 2005
@@ -26,6 +26,7 @@
 import org.pdfbox.exceptions.InvalidPasswordException;
 
 import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ContentProperties;
 import org.apache.nutch.util.LogFormatter;
 import org.apache.nutch.parse.ParseStatus;
 import org.apache.nutch.parse.Parser;
@@ -165,7 +166,7 @@
     Outlink[] outlinks = OutlinkExtractor.getOutlinks(text);
 
     // collect meta data
-    Properties metadata = new Properties();
+    ContentProperties metadata = new ContentProperties();
     metadata.putAll(content.getMetadata()); // copy through
 
     ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, metadata);

Modified: lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java?rev=355828&r1=355827&r2=355828&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java Sat Dec 10 16:36:57 2005
@@ -19,13 +19,14 @@
 import java.util.Properties;
 
 import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ContentProperties;
 import org.apache.nutch.parse.*;
 import org.apache.nutch.util.*;
 
 public class TextParser implements Parser {
   public Parse getParse(Content content) {
     // copy content meta data through
-    Properties metadata = new Properties();
+    ContentProperties metadata = new ContentProperties();
     metadata.putAll(content.getMetadata());
 
     //ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "", new Outlink[0], metadata);

Modified: lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java?rev=355828&r1=355827&r2=355828&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java Sat Dec 10 16:36:57 2005
@@ -31,6 +31,7 @@
 import org.apache.nutch.parse.ParseStatus;
 import org.apache.nutch.parse.Parser;
 import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ContentProperties;
 import org.apache.nutch.util.LogFormatter;
 
 /**
@@ -87,7 +88,7 @@
         }
         
         // collect meta data
-        final Properties metadata = new Properties();
+        final ContentProperties metadata = new ContentProperties();
         metadata.putAll(content.getMetadata()); // copy through
         
         if (resultText == null) {

Modified: lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java?rev=355828&r1=355827&r2=355828&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java Sat Dec 10 16:36:57 2005
@@ -33,6 +33,7 @@
 import org.apache.nutch.parse.ParseException;
 import org.apache.nutch.parse.Outlink;
 import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ContentProperties;
 import org.apache.nutch.util.LogFormatter;
 import org.apache.nutch.util.NutchConf;
 import org.apache.nutch.util.mime.MimeTypes;
@@ -84,7 +85,7 @@
           // Trying to resolve the Mime-Type
           String contentType = MIME.getMimeType(fname).getName();
           try {
-            Properties metadata = new Properties();
+            ContentProperties metadata = new ContentProperties();
             metadata.setProperty("Content-Length", Long.toString(entry.getSize()));
             metadata.setProperty("Content-Type", contentType);
             Content content = new Content(newurl, base, b, contentType, metadata);

Modified: lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java?rev=355828&r1=355827&r2=355828&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java (original)
+++ lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java Sat Dec 10 16:36:57 2005
@@ -25,6 +25,7 @@
 
 // Nutch imports
 import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ContentProperties;
 
 
 /************************************
@@ -59,7 +60,7 @@
   private String base;
   private byte[] content;
   private int code;
-  private Properties headers = new Properties();
+  private ContentProperties headers = new ContentProperties();
 
   private final File file;
 

Modified: lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java?rev=355828&r1=355827&r2=355828&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java (original)
+++ lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java Sat Dec 10 16:36:57 2005
@@ -25,6 +25,7 @@
 import org.apache.commons.net.ftp.parser.ParserInitializationException;
 
 import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ContentProperties;
 
 import java.net.InetAddress;
 import java.net.URL;
@@ -59,7 +60,7 @@
   private String base;
   private byte[] content;
   private int code;
-  private Properties headers = new Properties();
+  private ContentProperties headers = new ContentProperties();
 
   private final Ftp ftp;
 

Modified: lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java?rev=355828&r1=355827&r2=355828&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java (original)
+++ lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java Sat Dec 10 16:36:57 2005
@@ -32,6 +32,7 @@
 import java.util.logging.Level;
 
 import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ContentProperties;
 import org.apache.nutch.protocol.ProtocolException;
 
 import org.apache.nutch.util.GZIPUtils;
@@ -44,7 +45,7 @@
   private String base;
   private byte[] content;
   private int code;
-  private Properties headers = new Properties();
+  private ContentProperties headers = new ContentProperties();
 
   /** Returns the response code. */
   public int getCode() { return code; }

Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java?rev=355828&r1=355827&r2=355828&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java (original)
+++ lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java Sat Dec 10 16:36:57 2005
@@ -11,6 +11,8 @@
 import java.util.TreeMap;
 import java.util.logging.Level;
 import java.util.logging.Logger;
+
+import org.apache.nutch.protocol.ContentProperties;
 import org.apache.nutch.util.LogFormatter;
 import org.apache.nutch.util.NutchConf;
 
@@ -44,7 +46,7 @@
     
     private HttpAuthenticationFactory() { }
     
-    public static HttpAuthentication findAuthentication(Properties header) {
+    public static HttpAuthentication findAuthentication(ContentProperties header) {
         if (header == null) return null;
         
     	try {

Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/MultiProperties.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/MultiProperties.java?rev=355828&r1=355827&r2=355828&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/MultiProperties.java (original)
+++ lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/MultiProperties.java Sat Dec 10 16:36:57 2005
@@ -10,17 +10,18 @@
 import java.util.Iterator;
 import java.util.logging.Logger;
 
+import org.apache.nutch.protocol.ContentProperties;
 import org.apache.nutch.util.LogFormatter;
 
 /**
- * An extension to {@link Properties} which allows multiple values for a single key.
+ * An extension to {@link ContentProperties} which allows multiple values for a single key.
  * The {@link #get(Object)} method may return a single value or a
  * {@link java.util.Collection} of values.
  *
  * @author Matt Tencati
  */
 
-public class MultiProperties extends Properties {
+public class MultiProperties extends ContentProperties {
     public static final Logger LOG = LogFormatter
             .getLogger("net.nutch.protocol.http.MultiProperties");
 
@@ -31,7 +32,7 @@
      */
     public MultiProperties() {
         super();
-        multiMap = new TreeMap();
+        multiMap = new TreeMap(String.CASE_INSENSITIVE_ORDER);
     }
 
     /**
@@ -41,7 +42,7 @@
      */
     public MultiProperties(Properties defaults) {
         super(defaults);
-        multiMap = new TreeMap();
+        multiMap = new TreeMap(String.CASE_INSENSITIVE_ORDER);
     }
 
     /** 

Modified: lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParseData.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParseData.java?rev=355828&r1=355827&r2=355828&view=diff
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParseData.java (original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParseData.java Sat Dec 10 16:36:57 2005
@@ -16,10 +16,8 @@
 
 package org.apache.nutch.parse;
 
-import java.io.*;
-import java.util.Properties;
 import org.apache.nutch.io.*;
-import org.apache.nutch.pagedb.*;
+import org.apache.nutch.protocol.ContentProperties;
 import junit.framework.TestCase;
 
 /** Unit tests for ParseData. */
@@ -36,7 +34,7 @@
       new Outlink("http://bar.com/", "Bar")
     };
 
-    Properties metaData = new Properties();
+    ContentProperties metaData = new ContentProperties();
     metaData.put("Language", "en/us");
     metaData.put("Charset", "UTF-8");
 

Modified: lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java?rev=355828&r1=355827&r2=355828&view=diff
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java (original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java Sat Dec 10 16:36:57 2005
@@ -16,10 +16,7 @@
 
 package org.apache.nutch.protocol;
 
-import java.io.*;
-import java.util.Properties;
 import org.apache.nutch.io.*;
-import org.apache.nutch.pagedb.*;
 import junit.framework.TestCase;
 
 /** Unit tests for Content. */
@@ -33,7 +30,7 @@
 
     String url = "http://www.foo.com/";
 
-    Properties metaData = new Properties();
+    ContentProperties metaData = new ContentProperties();
     metaData.put("Host", "www.foo.com");
     metaData.put("Content-Type", "text/html");
 
@@ -41,12 +38,14 @@
                             metaData);
                         
     TestWritable.testWritable(r);
+    assertEquals("text/html", r.getMetadata().get("Content-Type"));
+    assertEquals("text/html", r.getMetadata().get("content-type"));
   }
 
   /** Unit tests for getContentType(String, String, byte[]) method. */
   public void testGetContentType() throws Exception {
     Content c = null;
-    Properties p = new Properties();
+    ContentProperties p = new ContentProperties();
 
     c = new Content("http://www.foo.com/",
                     "http://www.foo.com/",

Modified: lucene/nutch/trunk/src/test/org/apache/nutch/tools/TestSegmentMergeTool.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/test/org/apache/nutch/tools/TestSegmentMergeTool.java?rev=355828&r1=355827&r2=355828&view=diff
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/tools/TestSegmentMergeTool.java (original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/tools/TestSegmentMergeTool.java Sat Dec 10 16:36:57 2005
@@ -32,6 +32,7 @@
 import org.apache.nutch.parse.ParseStatus;
 import org.apache.nutch.parse.ParseText;
 import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ContentProperties;
 import org.apache.nutch.protocol.ProtocolStatus;
 
 import junit.framework.TestCase;
@@ -90,7 +91,7 @@
         content.append("<p>" + k + " lines of text in the queue, " + k + " lines of text...</p>\n");
       }
       content.append("</body></html>");
-      Properties meta = new Properties();
+      ContentProperties meta = new ContentProperties();
       meta.setProperty("Content-Type", "text/html");
       meta.setProperty("Host", "http://localhost");
       meta.setProperty("Connection", "Keep-alive, close");

Modified: lucene/nutch/trunk/src/web/jsp/cached.jsp
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/web/jsp/cached.jsp?rev=355828&r1=355827&r2=355828&view=diff
==============================================================================
--- lucene/nutch/trunk/src/web/jsp/cached.jsp (original)
+++ lucene/nutch/trunk/src/web/jsp/cached.jsp Sat Dec 10 16:36:57 2005
@@ -7,6 +7,7 @@
 
   import="org.apache.nutch.searcher.*"
   import="org.apache.nutch.parse.ParseData"
+  import="org.apache.nutch.protocol.ContentProperties"
 %><%
   NutchBean bean = NutchBean.get(application);
   bean.LOG.info("cache request from " + request.getRemoteAddr());
@@ -19,7 +20,7 @@
     ResourceBundle.getBundle("org.nutch.jsp.cached", request.getLocale())
     .getLocale().getLanguage();
 
-  Properties metaData = bean.getParseData(details).getMetadata();
+  ContentProperties metaData = bean.getParseData(details).getMetadata();
 
   String content = null;
   String contentType = (String) metaData.get("Content-Type");