You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by je...@apache.org on 2005/12/11 01:37:50 UTC
svn commit: r355828 - in /lucene/nutch/trunk/src:
java/org/apache/nutch/fetcher/ java/org/apache/nutch/parse/
java/org/apache/nutch/protocol/ java/org/apache/nutch/servlet/
java/org/apache/nutch/tools/
plugin/creativecommons/src/java/org/creativecommon...
Author: jerome
Date: Sat Dec 10 16:36:57 2005
New Revision: 355828
URL: http://svn.apache.org/viewcvs?rev=355828&view=rev
Log:
NUTCH-135 : Content metadata keys are now case-insensitive (thanks to S. Groschupf)
Added:
lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ContentProperties.java (with props)
Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java
lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java
lucene/nutch/trunk/src/java/org/apache/nutch/servlet/Cached.java
lucene/nutch/trunk/src/java/org/apache/nutch/tools/ParseSegment.java
lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java
lucene/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java
lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
lucene/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java
lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java
lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java
lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java
lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java
lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java
lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java
lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/MultiProperties.java
lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParseData.java
lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java
lucene/nutch/trunk/src/test/org/apache/nutch/tools/TestSegmentMergeTool.java
lucene/nutch/trunk/src/web/jsp/cached.jsp
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=355828&r1=355827&r2=355828&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Sat Dec 10 16:36:57 2005
@@ -234,7 +234,7 @@
MD5Hash hash = null;
String url = fle.getPage().getURL().toString();
if (content == null) {
- content = new Content(url, url, new byte[0], "", new Properties());
+ content = new Content(url, url, new byte[0], "", new ContentProperties());
hash = MD5Hash.digest(url);
} else {
hash = MD5Hash.digest(content.getContent());
@@ -263,7 +263,7 @@
+ status.toString());
outputPage(new FetcherOutput(fle, hash, protocolStatus),
content, new ParseText(""),
- new ParseData(status, "", new Outlink[0], new Properties()));
+ new ParseData(status, "", new Outlink[0], new ContentProperties()));
}
return status;
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java?rev=355828&r1=355827&r2=355828&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java Sat Dec 10 16:36:57 2005
@@ -21,6 +21,7 @@
import org.apache.nutch.io.*;
import org.apache.nutch.fs.*;
+import org.apache.nutch.protocol.ContentProperties;
import org.apache.nutch.tools.UpdateDatabaseTool;
@@ -34,12 +35,12 @@
private String title;
private Outlink[] outlinks;
- private Properties metadata;
+ private ContentProperties metadata;
private ParseStatus status;
public ParseData() {}
- public ParseData(ParseStatus status, String title, Outlink[] outlinks, Properties metadata) {
+ public ParseData(ParseStatus status, String title, Outlink[] outlinks, ContentProperties metadata) {
this.status = status;
this.title = title;
this.outlinks = outlinks;
@@ -62,7 +63,7 @@
/** Other page properties. This is the place to find format-specific
* properties. Different parser implementations for different content types
* will populate this differently. */
- public Properties getMetadata() { return metadata; }
+ public ContentProperties getMetadata() { return metadata; }
/** Return the value of a metadata property. */
public String get(String name) { return getMetadata().getProperty(name); }
@@ -94,7 +95,7 @@
}
int propertyCount = in.readInt(); // read metadata
- metadata = new Properties();
+ metadata = new ContentProperties();
for (int i = 0; i < propertyCount; i++) {
metadata.put(UTF8.readString(in), UTF8.readString(in));
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java?rev=355828&r1=355827&r2=355828&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java Sat Dec 10 16:36:57 2005
@@ -12,6 +12,7 @@
import org.apache.nutch.io.VersionedWritable;
import org.apache.nutch.io.WritableUtils;
+import org.apache.nutch.protocol.ContentProperties;
/**
* @author Andrzej Bialecki <ab@getopt.org>
@@ -230,7 +231,7 @@
private ParseData data = null;
public EmptyParseImpl(ParseStatus status) {
- data = new ParseData(status, "", new Outlink[0], new Properties());
+ data = new ParseData(status, "", new Outlink[0], new ContentProperties());
}
public ParseData getData() {
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java?rev=355828&r1=355827&r2=355828&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java Sat Dec 10 16:36:57 2005
@@ -45,12 +45,12 @@
private String base;
private byte[] content;
private String contentType;
- private Properties metadata;
+ private ContentProperties metadata;
public Content() {}
public Content(String url, String base, byte[] content, String contentType,
- Properties metadata) {
+ ContentProperties metadata) {
if (url == null) throw new IllegalArgumentException("null url");
if (base == null) throw new IllegalArgumentException("null base");
@@ -77,7 +77,7 @@
contentType = UTF8.readString(in); // read contentType
int propertyCount = in.readInt(); // read metadata
- metadata = new Properties();
+ metadata = new ContentProperties();
for (int i = 0; i < propertyCount; i++) {
metadata.put(UTF8.readString(in), UTF8.readString(in));
}
@@ -134,7 +134,7 @@
}
/** Other protocol-specific data. */
- public Properties getMetadata() { return metadata; }
+ public ContentProperties getMetadata() { return metadata; }
/** Return the value of a metadata property. */
public String get(String name) { return getMetadata().getProperty(name); }
Added: lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ContentProperties.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ContentProperties.java?rev=355828&view=auto
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ContentProperties.java (added)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ContentProperties.java Sat Dec 10 16:36:57 2005
@@ -0,0 +1,88 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol;
+
+import java.util.Enumeration;
+import java.util.Iterator;
+import java.util.Properties;
+import java.util.TreeMap;
+
+/**
+ * case insensitive properties
+ */
+public class ContentProperties extends TreeMap {
+
+ /**
+ * construct the TreeMap with a case insensitive comparator
+ */
+ public ContentProperties() {
+ super(String.CASE_INSENSITIVE_ORDER);
+ }
+
+ /**
+ * initialize with default values
+ *
+ * @param defaults
+ */
+ public ContentProperties(Properties defaults) {
+ super(String.CASE_INSENSITIVE_ORDER);
+ putAll(defaults);
+ }
+
+ /**
+ * @param key
+ * @return the property value or null
+ */
+ public String getProperty(String key) {
+ return (String) get(key);
+ }
+
+ /**
+ * sets the key value tuple
+ *
+ * @param key
+ * @param value
+ */
+ public void setProperty(String key, String value) {
+ put(key, value);
+
+ }
+
+ public Enumeration propertyNames() {
+ return new KeyEnumeration(keySet().iterator());
+ }
+
+ class KeyEnumeration implements Enumeration {
+
+ private Iterator fIterator;
+
+ public KeyEnumeration(Iterator iterator) {
+ fIterator = iterator;
+ }
+
+ public boolean hasMoreElements() {
+ return fIterator.hasNext();
+
+ }
+
+ public Object nextElement() {
+ return fIterator.next();
+ }
+
+ }
+
+}
Propchange: lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ContentProperties.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/servlet/Cached.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/servlet/Cached.java?rev=355828&r1=355827&r2=355828&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/servlet/Cached.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/servlet/Cached.java Sat Dec 10 16:36:57 2005
@@ -16,6 +16,7 @@
package org.apache.nutch.servlet;
+import org.apache.nutch.protocol.ContentProperties;
import org.apache.nutch.searcher.NutchBean;
import org.apache.nutch.searcher.Hit;
import org.apache.nutch.searcher.HitDetails;
@@ -76,7 +77,7 @@
byte[] bytes = bean.getContent(details);
// pass all original headers? only these for now.
- Properties metaData = bean.getParseData(details).getMetadata();
+ ContentProperties metaData = bean.getParseData(details).getMetadata();
String contentType = (String) metaData.get("Content-Type");
//String lastModified = (String) metaData.get("Last-Modified");
//String contentLength = (String) metaData.get("Content-Length");
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/tools/ParseSegment.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/tools/ParseSegment.java?rev=355828&r1=355827&r2=355828&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/tools/ParseSegment.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/tools/ParseSegment.java Sat Dec 10 16:36:57 2005
@@ -240,7 +240,7 @@
}
outputPage(new ParseText(""),
new ParseData(new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_MISSING_CONTENT),
- "", new Outlink[0], new Properties()));
+ "", new Outlink[0], new ContentProperties()));
}
}
@@ -250,7 +250,7 @@
return;
}
outputPage(new ParseText(""),
- new ParseData(status, "", new Outlink[0], new Properties()));
+ new ParseData(status, "", new Outlink[0], new ContentProperties()));
}
private void outputPage
Modified: lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java?rev=355828&r1=355827&r2=355828&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java Sat Dec 10 16:36:57 2005
@@ -18,6 +18,7 @@
import org.apache.nutch.parse.*;
import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ContentProperties;
import org.apache.nutch.util.NutchConf;
import java.util.*;
@@ -51,7 +52,7 @@
}
/** Scan the document adding attributes to metadata.*/
- public static void walk(Node doc, URL base, Properties metadata)
+ public static void walk(Node doc, URL base, ContentProperties metadata)
throws ParseException {
// walk the DOM tree, scanning for license data
Modified: lucene/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java?rev=355828&r1=355827&r2=355828&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java Sat Dec 10 16:36:57 2005
@@ -19,6 +19,7 @@
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ContentProperties;
import java.util.Properties;
import java.io.*;
@@ -56,10 +57,10 @@
byte[] bytes = out.toByteArray();
Content content =
- new Content(url, url, bytes, contentType, new Properties());
+ new Content(url, url, bytes, contentType, new ContentProperties());
Parse parse = ParseUtil.parseByParserId("parse-html",content);
- Properties metadata = parse.getData().getMetadata();
+ ContentProperties metadata = parse.getData().getMetadata();
assertEquals(license, metadata.get("License-Url"));
assertEquals(location, metadata.get("License-Location"));
assertEquals(type, metadata.get("Work-Type"));
Modified: lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=355828&r1=355827&r2=355828&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java Sat Dec 10 16:36:57 2005
@@ -29,6 +29,7 @@
import org.apache.nutch.net.protocols.HttpDateFormat;
import org.apache.nutch.parse.Parse;
+import org.apache.nutch.protocol.ContentProperties;
import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.indexer.IndexingException;
@@ -86,7 +87,7 @@
String url = fo.getUrl().toString();
// normalize metaData (see note in the method below).
- Properties metaData = normalizeMeta(parse.getData().getMetadata());
+ ContentProperties metaData = normalizeMeta(parse.getData().getMetadata());
addTime(doc, metaData, url, fo);
@@ -101,7 +102,7 @@
// Add time related meta info. Add last-modified if present. Index date as
// last-modified, or, if that's not present, use fetch time.
- private Document addTime(Document doc, Properties metaData, String url,
+ private Document addTime(Document doc, ContentProperties metaData, String url,
FetcherOutput fo) {
long time = -1;
@@ -169,7 +170,7 @@
}
// Add Content-Length
- private Document addLength(Document doc, Properties metaData, String url) {
+ private Document addLength(Document doc, ContentProperties metaData, String url) {
String contentLength = metaData.getProperty("content-length");
if (contentLength != null)
@@ -179,7 +180,7 @@
}
// Add Content-Type and its primaryType and subType
- private Document addType(Document doc, Properties metaData, String url) {
+ private Document addType(Document doc, ContentProperties metaData, String url) {
MimeType mimeType = null;
String contentType = metaData.getProperty("content-type");
if (contentType == null) {
@@ -259,7 +260,7 @@
}
}
- private Document resetTitle(Document doc, Properties metaData, String url) {
+ private Document resetTitle(Document doc, ContentProperties metaData, String url) {
String contentDisposition = metaData.getProperty("content-disposition");
if (contentDisposition == null)
return doc;
@@ -284,8 +285,8 @@
// (*) empty header value
// Note: the original metaData should be kept intact,
// because there is a benefit to preserve whatever comes from server.
- private Properties normalizeMeta(Properties old) {
- Properties normalized = new Properties();
+ private ContentProperties normalizeMeta(ContentProperties old) {
+ ContentProperties normalized = new ContentProperties();
for (Enumeration e = old.propertyNames(); e.hasMoreElements();) {
String key = (String) e.nextElement();
Modified: lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java?rev=355828&r1=355827&r2=355828&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java (original)
+++ lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java Sat Dec 10 16:36:57 2005
@@ -15,8 +15,7 @@
*/
package org.apache.nutch.analysis.lang;
-// JDK imports
-import java.util.Properties;
+
// JUnit imports
import junit.framework.TestCase;
@@ -26,6 +25,7 @@
import org.apache.nutch.parse.Parser;
import org.apache.nutch.parse.ParserFactory;
import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ContentProperties;
public class TestHTMLLanguageParser extends TestCase {
@@ -122,7 +122,7 @@
private Content getContent(String text) {
- Properties p = new Properties();
+ ContentProperties p = new ContentProperties();
p.put("Content-Type", "text/html");
Content content = new Content(URL, BASE, text.getBytes(), "text/html", p);
Modified: lucene/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java?rev=355828&r1=355827&r2=355828&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java Sat Dec 10 16:36:57 2005
@@ -17,6 +17,7 @@
package org.apache.nutch.parse.ext;
import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ContentProperties;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.Parser;
import org.apache.nutch.parse.Parse;
@@ -155,7 +156,7 @@
Outlink[] outlinks = OutlinkExtractor.getOutlinks(text);
// collect meta data
- Properties metaData = new Properties();
+ ContentProperties metaData = new ContentProperties();
metaData.putAll(content.getMetadata()); // copy through
ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, metaData);
Modified: lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java?rev=355828&r1=355827&r2=355828&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java Sat Dec 10 16:36:57 2005
@@ -31,6 +31,7 @@
import org.apache.html.dom.*;
import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ContentProperties;
import org.apache.nutch.util.*;
import org.apache.nutch.parse.*;
@@ -106,7 +107,7 @@
String text = "";
String title = "";
Outlink[] outlinks = new Outlink[0];
- Properties metadata = new Properties();
+ ContentProperties metadata = new ContentProperties();
// check that contentType is one we can handle
String contentType = content.getContentType();
@@ -271,7 +272,7 @@
in.readFully(bytes);
Parse parse = new HtmlParser().getParse(new Content(url,url,
bytes,"text/html",
- new Properties()));
+ new ContentProperties()));
System.out.println("data: "+parse.getData());
System.out.println("text: "+parse.getText());
Modified: lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java?rev=355828&r1=355827&r2=355828&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java Sat Dec 10 16:36:57 2005
@@ -22,6 +22,7 @@
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.Parser;
import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ContentProperties;
import org.apache.nutch.util.LogFormatter;
import org.apache.oro.text.regex.MatchResult;
import org.apache.oro.text.regex.Pattern;
@@ -56,7 +57,7 @@
walk(doc, parse, metaTags, url, outlinks);
if (outlinks.size() > 0) {
Outlink[] old = parse.getData().getOutlinks();
- Properties metadata = parse.getData().getMetadata();
+ ContentProperties metadata = parse.getData().getMetadata();
String title = parse.getData().getTitle();
List list = Arrays.asList(old);
outlinks.addAll(list);
@@ -136,7 +137,7 @@
idx = Math.min(MAX_TITLE_LEN, script.length());
title = script.substring(0, idx);
}
- Properties metadata = new Properties();
+ ContentProperties metadata = new ContentProperties();
metadata.putAll(c.getMetadata());
ParseData pd = new ParseData(ParseStatus.STATUS_SUCCESS, title,
outlinks, metadata);
Modified: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java?rev=355828&r1=355827&r2=355828&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java Sat Dec 10 16:36:57 2005
@@ -30,6 +30,7 @@
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.Parser;
import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ContentProperties;
import org.apache.nutch.util.LogFormatter;
/**
@@ -73,7 +74,7 @@
byte[] raw = getRawBytes(new File(file));
- Properties prop = new Properties();
+ ContentProperties prop = new ContentProperties();
prop.setProperty("Content-Length", "" + raw.length);
Content content = new Content(file, file, raw, MIME_TYPE, prop);
@@ -130,7 +131,7 @@
}
// collect meta data
- final Properties metadata = new Properties();
+ final ContentProperties metadata = new ContentProperties();
metadata.putAll(content.getMetadata()); // copy through
if (properties != null) {
Modified: lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java?rev=355828&r1=355827&r2=355828&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java Sat Dec 10 16:36:57 2005
@@ -17,6 +17,7 @@
package org.apache.nutch.parse.msword;
import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ContentProperties;
import org.apache.nutch.util.LogFormatter;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.Parser;
@@ -102,7 +103,7 @@
}
// collect meta data
- Properties metadata = new Properties();
+ ContentProperties metadata = new ContentProperties();
metadata.putAll(content.getMetadata()); // copy through
if(properties != null) {
Modified: lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java?rev=355828&r1=355827&r2=355828&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java Sat Dec 10 16:36:57 2005
@@ -26,6 +26,7 @@
import org.pdfbox.exceptions.InvalidPasswordException;
import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ContentProperties;
import org.apache.nutch.util.LogFormatter;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.Parser;
@@ -165,7 +166,7 @@
Outlink[] outlinks = OutlinkExtractor.getOutlinks(text);
// collect meta data
- Properties metadata = new Properties();
+ ContentProperties metadata = new ContentProperties();
metadata.putAll(content.getMetadata()); // copy through
ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, metadata);
Modified: lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java?rev=355828&r1=355827&r2=355828&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java Sat Dec 10 16:36:57 2005
@@ -19,13 +19,14 @@
import java.util.Properties;
import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ContentProperties;
import org.apache.nutch.parse.*;
import org.apache.nutch.util.*;
public class TextParser implements Parser {
public Parse getParse(Content content) {
// copy content meta data through
- Properties metadata = new Properties();
+ ContentProperties metadata = new ContentProperties();
metadata.putAll(content.getMetadata());
//ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "", new Outlink[0], metadata);
Modified: lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java?rev=355828&r1=355827&r2=355828&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java Sat Dec 10 16:36:57 2005
@@ -31,6 +31,7 @@
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.Parser;
import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ContentProperties;
import org.apache.nutch.util.LogFormatter;
/**
@@ -87,7 +88,7 @@
}
// collect meta data
- final Properties metadata = new Properties();
+ final ContentProperties metadata = new ContentProperties();
metadata.putAll(content.getMetadata()); // copy through
if (resultText == null) {
Modified: lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java?rev=355828&r1=355827&r2=355828&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java Sat Dec 10 16:36:57 2005
@@ -33,6 +33,7 @@
import org.apache.nutch.parse.ParseException;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ContentProperties;
import org.apache.nutch.util.LogFormatter;
import org.apache.nutch.util.NutchConf;
import org.apache.nutch.util.mime.MimeTypes;
@@ -84,7 +85,7 @@
// Trying to resolve the Mime-Type
String contentType = MIME.getMimeType(fname).getName();
try {
- Properties metadata = new Properties();
+ ContentProperties metadata = new ContentProperties();
metadata.setProperty("Content-Length", Long.toString(entry.getSize()));
metadata.setProperty("Content-Type", contentType);
Content content = new Content(newurl, base, b, contentType, metadata);
Modified: lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java?rev=355828&r1=355827&r2=355828&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java (original)
+++ lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java Sat Dec 10 16:36:57 2005
@@ -25,6 +25,7 @@
// Nutch imports
import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ContentProperties;
/************************************
@@ -59,7 +60,7 @@
private String base;
private byte[] content;
private int code;
- private Properties headers = new Properties();
+ private ContentProperties headers = new ContentProperties();
private final File file;
Modified: lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java?rev=355828&r1=355827&r2=355828&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java (original)
+++ lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java Sat Dec 10 16:36:57 2005
@@ -25,6 +25,7 @@
import org.apache.commons.net.ftp.parser.ParserInitializationException;
import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ContentProperties;
import java.net.InetAddress;
import java.net.URL;
@@ -59,7 +60,7 @@
private String base;
private byte[] content;
private int code;
- private Properties headers = new Properties();
+ private ContentProperties headers = new ContentProperties();
private final Ftp ftp;
Modified: lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java?rev=355828&r1=355827&r2=355828&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java (original)
+++ lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java Sat Dec 10 16:36:57 2005
@@ -32,6 +32,7 @@
import java.util.logging.Level;
import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ContentProperties;
import org.apache.nutch.protocol.ProtocolException;
import org.apache.nutch.util.GZIPUtils;
@@ -44,7 +45,7 @@
private String base;
private byte[] content;
private int code;
- private Properties headers = new Properties();
+ private ContentProperties headers = new ContentProperties();
/** Returns the response code. */
public int getCode() { return code; }
Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java?rev=355828&r1=355827&r2=355828&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java (original)
+++ lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java Sat Dec 10 16:36:57 2005
@@ -11,6 +11,8 @@
import java.util.TreeMap;
import java.util.logging.Level;
import java.util.logging.Logger;
+
+import org.apache.nutch.protocol.ContentProperties;
import org.apache.nutch.util.LogFormatter;
import org.apache.nutch.util.NutchConf;
@@ -44,7 +46,7 @@
private HttpAuthenticationFactory() { }
- public static HttpAuthentication findAuthentication(Properties header) {
+ public static HttpAuthentication findAuthentication(ContentProperties header) {
if (header == null) return null;
try {
Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/MultiProperties.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/MultiProperties.java?rev=355828&r1=355827&r2=355828&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/MultiProperties.java (original)
+++ lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/MultiProperties.java Sat Dec 10 16:36:57 2005
@@ -10,17 +10,18 @@
import java.util.Iterator;
import java.util.logging.Logger;
+import org.apache.nutch.protocol.ContentProperties;
import org.apache.nutch.util.LogFormatter;
/**
- * An extension to {@link Properties} which allows multiple values for a single key.
+ * An extension to {@link ContentProperties} which allows multiple values for a single key.
* The {@link #get(Object)} method may return a single value or a
* {@link java.util.Collection} of values.
*
* @author Matt Tencati
*/
-public class MultiProperties extends Properties {
+public class MultiProperties extends ContentProperties {
public static final Logger LOG = LogFormatter
.getLogger("net.nutch.protocol.http.MultiProperties");
@@ -31,7 +32,7 @@
*/
public MultiProperties() {
super();
- multiMap = new TreeMap();
+ multiMap = new TreeMap(String.CASE_INSENSITIVE_ORDER);
}
/**
@@ -41,7 +42,7 @@
*/
public MultiProperties(Properties defaults) {
super(defaults);
- multiMap = new TreeMap();
+ multiMap = new TreeMap(String.CASE_INSENSITIVE_ORDER);
}
/**
Modified: lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParseData.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParseData.java?rev=355828&r1=355827&r2=355828&view=diff
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParseData.java (original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParseData.java Sat Dec 10 16:36:57 2005
@@ -16,10 +16,8 @@
package org.apache.nutch.parse;
-import java.io.*;
-import java.util.Properties;
import org.apache.nutch.io.*;
-import org.apache.nutch.pagedb.*;
+import org.apache.nutch.protocol.ContentProperties;
import junit.framework.TestCase;
/** Unit tests for ParseData. */
@@ -36,7 +34,7 @@
new Outlink("http://bar.com/", "Bar")
};
- Properties metaData = new Properties();
+ ContentProperties metaData = new ContentProperties();
metaData.put("Language", "en/us");
metaData.put("Charset", "UTF-8");
Modified: lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java?rev=355828&r1=355827&r2=355828&view=diff
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java (original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java Sat Dec 10 16:36:57 2005
@@ -16,10 +16,7 @@
package org.apache.nutch.protocol;
-import java.io.*;
-import java.util.Properties;
import org.apache.nutch.io.*;
-import org.apache.nutch.pagedb.*;
import junit.framework.TestCase;
/** Unit tests for Content. */
@@ -33,7 +30,7 @@
String url = "http://www.foo.com/";
- Properties metaData = new Properties();
+ ContentProperties metaData = new ContentProperties();
metaData.put("Host", "www.foo.com");
metaData.put("Content-Type", "text/html");
@@ -41,12 +38,14 @@
metaData);
TestWritable.testWritable(r);
+ assertEquals("text/html", r.getMetadata().get("Content-Type"));
+ assertEquals("text/html", r.getMetadata().get("content-type"));
}
/** Unit tests for getContentType(String, String, byte[]) method. */
public void testGetContentType() throws Exception {
Content c = null;
- Properties p = new Properties();
+ ContentProperties p = new ContentProperties();
c = new Content("http://www.foo.com/",
"http://www.foo.com/",
Modified: lucene/nutch/trunk/src/test/org/apache/nutch/tools/TestSegmentMergeTool.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/test/org/apache/nutch/tools/TestSegmentMergeTool.java?rev=355828&r1=355827&r2=355828&view=diff
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/tools/TestSegmentMergeTool.java (original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/tools/TestSegmentMergeTool.java Sat Dec 10 16:36:57 2005
@@ -32,6 +32,7 @@
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.ParseText;
import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ContentProperties;
import org.apache.nutch.protocol.ProtocolStatus;
import junit.framework.TestCase;
@@ -90,7 +91,7 @@
content.append("<p>" + k + " lines of text in the queue, " + k + " lines of text...</p>\n");
}
content.append("</body></html>");
- Properties meta = new Properties();
+ ContentProperties meta = new ContentProperties();
meta.setProperty("Content-Type", "text/html");
meta.setProperty("Host", "http://localhost");
meta.setProperty("Connection", "Keep-alive, close");
Modified: lucene/nutch/trunk/src/web/jsp/cached.jsp
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/web/jsp/cached.jsp?rev=355828&r1=355827&r2=355828&view=diff
==============================================================================
--- lucene/nutch/trunk/src/web/jsp/cached.jsp (original)
+++ lucene/nutch/trunk/src/web/jsp/cached.jsp Sat Dec 10 16:36:57 2005
@@ -7,6 +7,7 @@
import="org.apache.nutch.searcher.*"
import="org.apache.nutch.parse.ParseData"
+ import="org.apache.nutch.protocol.ContentProperties"
%><%
NutchBean bean = NutchBean.get(application);
bean.LOG.info("cache request from " + request.getRemoteAddr());
@@ -19,7 +20,7 @@
ResourceBundle.getBundle("org.nutch.jsp.cached", request.getLocale())
.getLocale().getLanguage();
- Properties metaData = bean.getParseData(details).getMetadata();
+ ContentProperties metaData = bean.getParseData(details).getMetadata();
String content = null;
String contentType = (String) metaData.get("Content-Type");