You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by cu...@apache.org on 2005/12/15 19:49:31 UTC
svn commit: r357056 - in /lucene/nutch/branches/mapred/src:
java/org/apache/nutch/crawl/ java/org/apache/nutch/fetcher/
java/org/apache/nutch/parse/ java/org/apache/nutch/protocol/
java/org/apache/nutch/servlet/ java/org/apache/nutch/tools/ java/org/ap...
Author: cutting
Date: Thu Dec 15 10:49:12 2005
New Revision: 357056
URL: http://svn.apache.org/viewcvs?rev=357056&view=rev
Log:
svn merge -r 326936:357049 https://svn.apache.org/repos/asf/lucene/nutch/trunk
Added:
lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/ContentProperties.java
- copied unchanged from r356540, lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ContentProperties.java
Modified:
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Indexer.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/fetcher/Fetcher.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/parse/ParseData.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/parse/ParseStatus.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/Content.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/servlet/Cached.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/tools/ParseSegment.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/util/mime/MimeType.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/util/mime/MimeTypes.java
lucene/nutch/branches/mapred/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java
lucene/nutch/branches/mapred/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java
lucene/nutch/branches/mapred/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
lucene/nutch/branches/mapred/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java
lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
lucene/nutch/branches/mapred/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
lucene/nutch/branches/mapred/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java
lucene/nutch/branches/mapred/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java
lucene/nutch/branches/mapred/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java
lucene/nutch/branches/mapred/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java
lucene/nutch/branches/mapred/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java
lucene/nutch/branches/mapred/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java
lucene/nutch/branches/mapred/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java
lucene/nutch/branches/mapred/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
lucene/nutch/branches/mapred/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
lucene/nutch/branches/mapred/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java
lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/MultiProperties.java
lucene/nutch/branches/mapred/src/test/org/apache/nutch/parse/TestParseData.java
lucene/nutch/branches/mapred/src/test/org/apache/nutch/protocol/TestContent.java
lucene/nutch/branches/mapred/src/test/org/apache/nutch/tools/TestSegmentMergeTool.java
lucene/nutch/branches/mapred/src/web/jsp/cached.jsp
Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java?rev=357056&r1=357055&r2=357056&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java Thu Dec 15 10:49:12 2005
@@ -18,7 +18,6 @@
import java.io.IOException;
import java.io.File;
-import java.util.Properties;
import org.apache.nutch.io.*;
import org.apache.nutch.fs.*;
@@ -193,7 +192,7 @@
if (content == null) {
String url = key.toString();
- content = new Content(url, url, new byte[0], "", new Properties());
+ content = new Content(url,url,new byte[0],"",new ContentProperties());
}
content.getMetadata().setProperty // add digest to metadata
Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Indexer.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Indexer.java?rev=357056&r1=357055&r2=357056&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Indexer.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Indexer.java Thu Dec 15 10:49:12 2005
@@ -26,6 +26,7 @@
import org.apache.nutch.util.*;
import org.apache.nutch.mapred.*;
import org.apache.nutch.parse.*;
+import org.apache.nutch.protocol.*;
import org.apache.nutch.analysis.*;
import org.apache.nutch.indexer.*;
@@ -187,7 +188,7 @@
}
Document doc = new Document();
- Properties meta = parseData.getMetadata();
+ ContentProperties meta = parseData.getMetadata();
String[] anchors = inlinks!=null ? inlinks.getAnchors() : new String[0];
// add segment, used to map from merged index back to segment files
Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=357056&r1=357055&r2=357056&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/fetcher/Fetcher.java Thu Dec 15 10:49:12 2005
@@ -234,7 +234,7 @@
MD5Hash hash = null;
String url = fle.getPage().getURL().toString();
if (content == null) {
- content = new Content(url, url, new byte[0], "", new Properties());
+ content = new Content(url, url, new byte[0], "", new ContentProperties());
hash = MD5Hash.digest(url);
} else {
hash = MD5Hash.digest(content.getContent());
@@ -263,7 +263,7 @@
+ status.toString());
outputPage(new FetcherOutput(fle, hash, protocolStatus),
content, new ParseText(""),
- new ParseData(status, "", new Outlink[0], new Properties()));
+ new ParseData(status, "", new Outlink[0], new ContentProperties()));
}
return status;
}
Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/parse/ParseData.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/parse/ParseData.java?rev=357056&r1=357055&r2=357056&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/parse/ParseData.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/parse/ParseData.java Thu Dec 15 10:49:12 2005
@@ -21,6 +21,7 @@
import org.apache.nutch.io.*;
import org.apache.nutch.fs.*;
+import org.apache.nutch.protocol.ContentProperties;
import org.apache.nutch.tools.UpdateDatabaseTool;
@@ -34,12 +35,12 @@
private String title;
private Outlink[] outlinks;
- private Properties metadata;
+ private ContentProperties metadata;
private ParseStatus status;
public ParseData() {}
- public ParseData(ParseStatus status, String title, Outlink[] outlinks, Properties metadata) {
+ public ParseData(ParseStatus status, String title, Outlink[] outlinks, ContentProperties metadata) {
this.status = status;
this.title = title;
this.outlinks = outlinks;
@@ -62,7 +63,7 @@
/** Other page properties. This is the place to find format-specific
* properties. Different parser implementations for different content types
* will populate this differently. */
- public Properties getMetadata() { return metadata; }
+ public ContentProperties getMetadata() { return metadata; }
/** Return the value of a metadata property. */
public String get(String name) { return getMetadata().getProperty(name); }
@@ -94,7 +95,7 @@
}
int propertyCount = in.readInt(); // read metadata
- metadata = new Properties();
+ metadata = new ContentProperties();
for (int i = 0; i < propertyCount; i++) {
metadata.put(UTF8.readString(in), UTF8.readString(in));
}
Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/parse/ParseStatus.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/parse/ParseStatus.java?rev=357056&r1=357055&r2=357056&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/parse/ParseStatus.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/parse/ParseStatus.java Thu Dec 15 10:49:12 2005
@@ -12,6 +12,7 @@
import org.apache.nutch.io.VersionedWritable;
import org.apache.nutch.io.WritableUtils;
+import org.apache.nutch.protocol.ContentProperties;
/**
* @author Andrzej Bialecki <ab@getopt.org>
@@ -230,7 +231,7 @@
private ParseData data = null;
public EmptyParseImpl(ParseStatus status) {
- data = new ParseData(status, "", new Outlink[0], new Properties());
+ data = new ParseData(status, "", new Outlink[0], new ContentProperties());
}
public ParseData getData() {
Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/Content.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/Content.java?rev=357056&r1=357055&r2=357056&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/Content.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/Content.java Thu Dec 15 10:49:12 2005
@@ -22,6 +22,9 @@
import org.apache.nutch.io.*;
import org.apache.nutch.fs.*;
import org.apache.nutch.util.*;
+import org.apache.nutch.util.mime.MimeType;
+import org.apache.nutch.util.mime.MimeTypes;
+import org.apache.nutch.util.mime.MimeTypeException;
public final class Content extends CompressedWritable {
@@ -29,28 +32,35 @@
private final static byte VERSION = 1;
+ /** A flag that tells if magic resolution must be performed */
+ private final static boolean MAGIC =
+ NutchConf.get().getBoolean("mime.type.magic", true);
+
+ /** Get the MimeTypes resolver instance. */
+ private final static MimeTypes MIME =
+ MimeTypes.get(NutchConf.get().get("mime.types.file"));
+
private byte version;
private String url;
private String base;
private byte[] content;
private String contentType;
- private Properties metadata;
+ private ContentProperties metadata;
public Content() {}
public Content(String url, String base, byte[] content, String contentType,
- Properties metadata){
+ ContentProperties metadata){
if (url == null) throw new IllegalArgumentException("null url");
if (base == null) throw new IllegalArgumentException("null base");
if (content == null) throw new IllegalArgumentException("null content");
- if (contentType == null) throw new IllegalArgumentException("null type");
if (metadata == null) throw new IllegalArgumentException("null metadata");
this.url = url;
this.base = base;
this.content = content;
- this.contentType = contentType;
+ this.contentType = getContentType(contentType, url, content);
this.metadata = metadata;
}
@@ -68,7 +78,7 @@
contentType = UTF8.readString(in); // read contentType
int propertyCount = in.readInt(); // read metadata
- metadata = new Properties();
+ metadata = new ContentProperties();
for (int i = 0; i < propertyCount; i++) {
metadata.put(UTF8.readString(in), UTF8.readString(in));
}
@@ -142,7 +152,7 @@
}
/** Other protocol-specific data. */
- public Properties getMetadata() {
+ public ContentProperties getMetadata() {
ensureInflated();
return metadata;
}
@@ -213,4 +223,33 @@
nfs.close();
}
}
+
+ private String getContentType(String typeName, String url, byte[] data) {
+
+ MimeType type = null;
+ try {
+ typeName = MimeType.clean(typeName);
+ type = typeName == null ? null : MIME.forName(typeName);
+ } catch (MimeTypeException mte) {
+ // Seems to be a malformed mime type name...
+ }
+
+ if (typeName == null || type == null || !type.matches(url)) {
+ // If no mime-type header, or cannot find a corresponding registered
+ // mime-type, or the one found doesn't match the url pattern
+ // it should be, then guess a mime-type from the url pattern
+ type = MIME.getMimeType(url);
+ typeName = type == null ? typeName : type.getName();
+ }
+ if (typeName == null || type == null ||
+ (MAGIC && type.hasMagic() && !type.matches(data))) {
+ // If no mime-type already found, or the one found doesn't match
+ // the magic bytes it should be, then, guess a mime-type from the
+ // document content (magic bytes)
+ type = MIME.getMimeType(data);
+ typeName = type == null ? typeName : type.getName();
+ }
+ return typeName;
+ }
+
}
Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/servlet/Cached.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/servlet/Cached.java?rev=357056&r1=357055&r2=357056&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/servlet/Cached.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/servlet/Cached.java Thu Dec 15 10:49:12 2005
@@ -16,6 +16,7 @@
package org.apache.nutch.servlet;
+import org.apache.nutch.protocol.ContentProperties;
import org.apache.nutch.searcher.NutchBean;
import org.apache.nutch.searcher.Hit;
import org.apache.nutch.searcher.HitDetails;
@@ -76,7 +77,7 @@
byte[] bytes = bean.getContent(details);
// pass all original headers? only these for now.
- Properties metaData = bean.getParseData(details).getMetadata();
+ ContentProperties metaData = bean.getParseData(details).getMetadata();
String contentType = (String) metaData.get("Content-Type");
//String lastModified = (String) metaData.get("Last-Modified");
//String contentLength = (String) metaData.get("Content-Length");
Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/tools/ParseSegment.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/tools/ParseSegment.java?rev=357056&r1=357055&r2=357056&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/tools/ParseSegment.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/tools/ParseSegment.java Thu Dec 15 10:49:12 2005
@@ -240,7 +240,7 @@
}
outputPage(new ParseText(""),
new ParseData(new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_MISSING_CONTENT),
- "", new Outlink[0], new Properties()));
+ "", new Outlink[0], new ContentProperties()));
}
}
@@ -250,7 +250,7 @@
return;
}
outputPage(new ParseText(""),
- new ParseData(status, "", new Outlink[0], new Properties()));
+ new ParseData(status, "", new Outlink[0], new ContentProperties()));
}
private void outputPage
Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/util/mime/MimeType.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/util/mime/MimeType.java?rev=357056&r1=357055&r2=357056&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/util/mime/MimeType.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/util/mime/MimeType.java Thu Dec 15 10:49:12 2005
@@ -227,11 +227,21 @@
return minLength;
}
- boolean hasMagic() {
+ public boolean hasMagic() {
return (magics.size() > 0);
}
- boolean matches(byte[] data) {
+ public boolean matches(String url) {
+ boolean match = false;
+ int index = url.lastIndexOf('.');
+ if ((index != -1) && (index < url.length()-1)) {
+ // There's an extension, so check whether it is one of this type's extensions
+ match = extensions.contains(url.substring(index + 1));
+ }
+ return match;
+ }
+
+ public boolean matches(byte[] data) {
if (!hasMagic()) { return false; }
Magic tested = null;
Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/util/mime/MimeTypes.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/util/mime/MimeTypes.java?rev=357056&r1=357055&r2=357056&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/util/mime/MimeTypes.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/util/mime/MimeTypes.java Thu Dec 15 10:49:12 2005
@@ -39,7 +39,10 @@
public final static String DEFAULT = "application/octet-stream";
/** All the registered MimeTypes */
- private ArrayList types = new ArrayList();
+ private ArrayList types = new ArrayList();
+
+ /** All the registered MimeType indexed by name */
+ private HashMap typesIdx = new HashMap();
/** MimeTypes indexed on the file extension */
private Map extIdx = new HashMap();
@@ -211,7 +214,14 @@
}
return mimeType;
}
-
+
+ /**
+ * Return a MimeType from its name.
+ */
+ public MimeType forName(String name) {
+ return (MimeType) typesIdx.get(name);
+ }
+
/**
* Return the minimum length of data to provide to analyzing methods
* based on the document's content in order to check all the known
@@ -241,6 +251,7 @@
* @param type is the mime-type to add.
*/
void add(MimeType type) {
+ typesIdx.put(type.getName(), type);
types.add(type);
// Update minLentgth
minLength = Math.max(minLength, type.getMinLength());
Modified: lucene/nutch/branches/mapred/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java?rev=357056&r1=357055&r2=357056&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java Thu Dec 15 10:49:12 2005
@@ -18,6 +18,7 @@
import org.apache.nutch.parse.*;
import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ContentProperties;
import org.apache.nutch.util.NutchConf;
import java.util.*;
@@ -51,7 +52,7 @@
}
/** Scan the document adding attributes to metadata.*/
- public static void walk(Node doc, URL base, Properties metadata)
+ public static void walk(Node doc, URL base, ContentProperties metadata)
throws ParseException {
// walk the DOM tree, scanning for license data
Modified: lucene/nutch/branches/mapred/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java?rev=357056&r1=357055&r2=357056&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java Thu Dec 15 10:49:12 2005
@@ -19,6 +19,7 @@
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ContentProperties;
import java.util.Properties;
import java.io.*;
@@ -56,10 +57,10 @@
byte[] bytes = out.toByteArray();
Content content =
- new Content(url, url, bytes, contentType, new Properties());
+ new Content(url, url, bytes, contentType, new ContentProperties());
Parse parse = ParseUtil.parseByParserId("parse-html",content);
- Properties metadata = parse.getData().getMetadata();
+ ContentProperties metadata = parse.getData().getMetadata();
assertEquals(license, metadata.get("License-Url"));
assertEquals(location, metadata.get("License-Location"));
assertEquals(type, metadata.get("Work-Type"));
Modified: lucene/nutch/branches/mapred/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=357056&r1=357055&r2=357056&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java Thu Dec 15 10:49:12 2005
@@ -29,6 +29,7 @@
import org.apache.nutch.net.protocols.HttpDateFormat;
import org.apache.nutch.parse.Parse;
+import org.apache.nutch.protocol.ContentProperties;
import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.indexer.IndexingException;
@@ -86,7 +87,7 @@
String url = fo.getUrl().toString();
// normalize metaData (see note in the method below).
- Properties metaData = normalizeMeta(parse.getData().getMetadata());
+ ContentProperties metaData = normalizeMeta(parse.getData().getMetadata());
addTime(doc, metaData, url, fo);
@@ -101,7 +102,7 @@
// Add time related meta info. Add last-modified if present. Index date as
// last-modified, or, if that's not present, use fetch time.
- private Document addTime(Document doc, Properties metaData, String url,
+ private Document addTime(Document doc, ContentProperties metaData, String url,
FetcherOutput fo) {
long time = -1;
@@ -169,7 +170,7 @@
}
// Add Content-Length
- private Document addLength(Document doc, Properties metaData, String url) {
+ private Document addLength(Document doc, ContentProperties metaData, String url) {
String contentLength = metaData.getProperty("content-length");
if (contentLength != null)
@@ -179,7 +180,7 @@
}
// Add Content-Type and its primaryType and subType
- private Document addType(Document doc, Properties metaData, String url) {
+ private Document addType(Document doc, ContentProperties metaData, String url) {
MimeType mimeType = null;
String contentType = metaData.getProperty("content-type");
if (contentType == null) {
@@ -259,7 +260,7 @@
}
}
- private Document resetTitle(Document doc, Properties metaData, String url) {
+ private Document resetTitle(Document doc, ContentProperties metaData, String url) {
String contentDisposition = metaData.getProperty("content-disposition");
if (contentDisposition == null)
return doc;
@@ -284,8 +285,8 @@
// (*) empty header value
// Note: the original metaData should be kept intact,
// because there is a benefit to preserve whatever comes from server.
- private Properties normalizeMeta(Properties old) {
- Properties normalized = new Properties();
+ private ContentProperties normalizeMeta(ContentProperties old) {
+ ContentProperties normalized = new ContentProperties();
for (Enumeration e = old.propertyNames(); e.hasMoreElements();) {
String key = (String) e.nextElement();
Modified: lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java?rev=357056&r1=357055&r2=357056&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java Thu Dec 15 10:49:12 2005
@@ -15,8 +15,7 @@
*/
package org.apache.nutch.analysis.lang;
-// JDK imports
-import java.util.Properties;
+
// JUnit imports
import junit.framework.TestCase;
@@ -26,6 +25,7 @@
import org.apache.nutch.parse.Parser;
import org.apache.nutch.parse.ParserFactory;
import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ContentProperties;
public class TestHTMLLanguageParser extends TestCase {
@@ -122,7 +122,7 @@
private Content getContent(String text) {
- Properties p = new Properties();
+ ContentProperties p = new ContentProperties();
p.put("Content-Type", "text/html");
Content content = new Content(URL, BASE, text.getBytes(), "text/html", p);
Modified: lucene/nutch/branches/mapred/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java?rev=357056&r1=357055&r2=357056&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java Thu Dec 15 10:49:12 2005
@@ -17,6 +17,7 @@
package org.apache.nutch.parse.ext;
import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ContentProperties;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.Parser;
import org.apache.nutch.parse.Parse;
@@ -155,7 +156,7 @@
Outlink[] outlinks = OutlinkExtractor.getOutlinks(text);
// collect meta data
- Properties metaData = new Properties();
+ ContentProperties metaData = new ContentProperties();
metaData.putAll(content.getMetadata()); // copy through
ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, metaData);
Modified: lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java?rev=357056&r1=357055&r2=357056&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java Thu Dec 15 10:49:12 2005
@@ -31,6 +31,7 @@
import org.apache.html.dom.*;
import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ContentProperties;
import org.apache.nutch.util.*;
import org.apache.nutch.parse.*;
@@ -106,19 +107,14 @@
String text = "";
String title = "";
Outlink[] outlinks = new Outlink[0];
- Properties metadata = new Properties();
+ ContentProperties metadata = new ContentProperties();
- // check that contentType is one we can handle
- String contentType = content.getContentType();
- if (!"".equals(contentType) && !contentType.startsWith("text/html"))
- return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_INVALID_FORMAT,
- "Content-Type not text/html: " + contentType).getEmptyParse();
-
// parse the content
DocumentFragment root;
try {
byte[] contentInOctets = content.getContent();
InputSource input = new InputSource(new ByteArrayInputStream(contentInOctets));
+ String contentType = content.getMetadata().getProperty("Content-Type");
String encoding = StringUtil.parseCharacterEncoding(contentType);
if (encoding!=null) {
metadata.put("OriginalCharEncoding", encoding);
@@ -271,7 +267,7 @@
in.readFully(bytes);
Parse parse = new HtmlParser().getParse(new Content(url,url,
bytes,"text/html",
- new Properties()));
+ new ContentProperties()));
System.out.println("data: "+parse.getData());
System.out.println("text: "+parse.getText());
Modified: lucene/nutch/branches/mapred/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java?rev=357056&r1=357055&r2=357056&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java Thu Dec 15 10:49:12 2005
@@ -22,6 +22,7 @@
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.Parser;
import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ContentProperties;
import org.apache.nutch.util.LogFormatter;
import org.apache.oro.text.regex.MatchResult;
import org.apache.oro.text.regex.Pattern;
@@ -56,7 +57,7 @@
walk(doc, parse, metaTags, url, outlinks);
if (outlinks.size() > 0) {
Outlink[] old = parse.getData().getOutlinks();
- Properties metadata = parse.getData().getMetadata();
+ ContentProperties metadata = parse.getData().getMetadata();
String title = parse.getData().getTitle();
List list = Arrays.asList(old);
outlinks.addAll(list);
@@ -136,7 +137,7 @@
idx = Math.min(MAX_TITLE_LEN, script.length());
title = script.substring(0, idx);
}
- Properties metadata = new Properties();
+ ContentProperties metadata = new ContentProperties();
metadata.putAll(c.getMetadata());
ParseData pd = new ParseData(ParseStatus.STATUS_SUCCESS, title,
outlinks, metadata);
Modified: lucene/nutch/branches/mapred/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java?rev=357056&r1=357055&r2=357056&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java Thu Dec 15 10:49:12 2005
@@ -30,6 +30,7 @@
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.Parser;
import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ContentProperties;
import org.apache.nutch.util.LogFormatter;
/**
@@ -73,7 +74,7 @@
byte[] raw = getRawBytes(new File(file));
- Properties prop = new Properties();
+ ContentProperties prop = new ContentProperties();
prop.setProperty("Content-Length", "" + raw.length);
Content content = new Content(file, file, raw, MIME_TYPE, prop);
@@ -88,15 +89,6 @@
*/
public Parse getParse(final Content content) {
- // check that contentType is one we can handle
- final String contentType = content.getContentType();
-
- if (contentType != null && !contentType.startsWith(MIME_TYPE)) {
- return new ParseStatus(ParseStatus.FAILED,
- ParseStatus.FAILED_INVALID_FORMAT, "Content-Type is not ["
- + MIME_TYPE + "] was: " + contentType).getEmptyParse();
- }
-
String plainText = null;
String title = null;
Outlink[] outlinks = null;
@@ -130,7 +122,7 @@
}
// collect meta data
- final Properties metadata = new Properties();
+ final ContentProperties metadata = new ContentProperties();
metadata.putAll(content.getMetadata()); // copy through
if (properties != null) {
@@ -169,4 +161,4 @@
}
}
-}
\ No newline at end of file
+}
Modified: lucene/nutch/branches/mapred/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java?rev=357056&r1=357055&r2=357056&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java Thu Dec 15 10:49:12 2005
@@ -17,6 +17,7 @@
package org.apache.nutch.parse.msword;
import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ContentProperties;
import org.apache.nutch.util.LogFormatter;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.Parser;
@@ -56,12 +57,6 @@
public Parse getParse(Content content) {
- // check that contentType is one we can handle
- String contentType = content.getContentType();
- if (contentType != null && !contentType.startsWith("application/msword"))
- return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_INVALID_FORMAT,
- "Content-Type not application/msword: " + contentType).getEmptyParse();
-
String text = null;
String title = null;
Properties properties = null;
@@ -102,7 +97,7 @@
}
// collect meta data
- Properties metadata = new Properties();
+ ContentProperties metadata = new ContentProperties();
metadata.putAll(content.getMetadata()); // copy through
if(properties != null) {
Modified: lucene/nutch/branches/mapred/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java?rev=357056&r1=357055&r2=357056&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java Thu Dec 15 10:49:12 2005
@@ -26,6 +26,7 @@
import org.pdfbox.exceptions.InvalidPasswordException;
import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ContentProperties;
import org.apache.nutch.util.LogFormatter;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.Parser;
@@ -83,12 +84,6 @@
public Parse getParse(Content content) {
- // check that contentType is one we can handle
- String contentType = content.getContentType();
- if (contentType != null && !contentType.startsWith("application/pdf"))
- return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_INVALID_FORMAT,
- "Content-Type not application/pdf: " + contentType).getEmptyParse();
-
// in memory representation of pdf file
PDDocument pdf = null;
@@ -165,7 +160,7 @@
Outlink[] outlinks = OutlinkExtractor.getOutlinks(text);
// collect meta data
- Properties metadata = new Properties();
+ ContentProperties metadata = new ContentProperties();
metadata.putAll(content.getMetadata()); // copy through
ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, metadata);
Modified: lucene/nutch/branches/mapred/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java?rev=357056&r1=357055&r2=357056&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java Thu Dec 15 10:49:12 2005
@@ -101,15 +101,6 @@
*/
public Parse getParse(Content content) {
- // check that contentType is one we can handle
- String contentType = content.getContentType();
- if (contentType != null
- && (!contentType.startsWith("text/xml") && !contentType
- .startsWith("application/rss+xml")))
- return new ParseStatus(ParseStatus.FAILED_INVALID_FORMAT,
- "Content-Type not text/xml or application/rss+xml: "
- + contentType).getEmptyParse();
-
List theRSSChannels = null;
try {
Modified: lucene/nutch/branches/mapred/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java?rev=357056&r1=357055&r2=357056&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java Thu Dec 15 10:49:12 2005
@@ -19,13 +19,14 @@
import java.util.Properties;
import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ContentProperties;
import org.apache.nutch.parse.*;
import org.apache.nutch.util.*;
public class TextParser implements Parser {
public Parse getParse(Content content) {
// copy content meta data through
- Properties metadata = new Properties();
+ ContentProperties metadata = new ContentProperties();
metadata.putAll(content.getMetadata());
//ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "", new Outlink[0], metadata);
Modified: lucene/nutch/branches/mapred/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java?rev=357056&r1=357055&r2=357056&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java Thu Dec 15 10:49:12 2005
@@ -31,6 +31,7 @@
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.Parser;
import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ContentProperties;
import org.apache.nutch.util.LogFormatter;
/**
@@ -47,13 +48,6 @@
public Parse getParse(final Content content) {
- // check that contentType is one we can handle
- final String contentType = content.getContentType();
- if (contentType != null && !contentType.startsWith("application/zip")) {
- return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_INVALID_FORMAT,
- "Content-Type not application/zip: " + contentType).getEmptyParse();
- }
-
String resultText = null;
String resultTitle = null;
Outlink[] outlinks = null;
@@ -87,7 +81,7 @@
}
// collect meta data
- final Properties metadata = new Properties();
+ final ContentProperties metadata = new ContentProperties();
metadata.putAll(content.getMetadata()); // copy through
if (resultText == null) {
Modified: lucene/nutch/branches/mapred/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java?rev=357056&r1=357055&r2=357056&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java Thu Dec 15 10:49:12 2005
@@ -33,6 +33,7 @@
import org.apache.nutch.parse.ParseException;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ContentProperties;
import org.apache.nutch.util.LogFormatter;
import org.apache.nutch.util.NutchConf;
import org.apache.nutch.util.mime.MimeTypes;
@@ -84,7 +85,7 @@
// Trying to resolve the Mime-Type
String contentType = MIME.getMimeType(fname).getName();
try {
- Properties metadata = new Properties();
+ ContentProperties metadata = new ContentProperties();
metadata.setProperty("Content-Length", Long.toString(entry.getSize()));
metadata.setProperty("Content-Type", contentType);
Content content = new Content(newurl, base, b, contentType, metadata);
Modified: lucene/nutch/branches/mapred/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java?rev=357056&r1=357055&r2=357056&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java Thu Dec 15 10:49:12 2005
@@ -25,9 +25,7 @@
// Nutch imports
import org.apache.nutch.protocol.Content;
-import org.apache.nutch.util.NutchConf;
-import org.apache.nutch.util.mime.MimeType;
-import org.apache.nutch.util.mime.MimeTypes;
+import org.apache.nutch.protocol.ContentProperties;
/************************************
@@ -58,20 +56,11 @@
***********************************/
public class FileResponse {
- /** A flag that tells if magic resolution must be performed */
- private final static boolean MAGIC =
- NutchConf.get().getBoolean("mime.type.magic", true);
-
- /** Get the MimeTypes resolver instance. */
- private final static MimeTypes MIME =
- MimeTypes.get(NutchConf.get().get("mime.types.file"));
-
-
private String orig;
private String base;
private byte[] content;
private int code;
- private Properties headers = new Properties();
+ private ContentProperties headers = new ContentProperties();
private final File file;
@@ -201,15 +190,8 @@
hdrs.put("Last-Modified",
this.file.httpDateFormat.toString(f.lastModified()));
- MimeType contentType = null;
- if (MAGIC) {
- contentType = MIME.getMimeType(f.getName(), this.content);
- } else {
- contentType = MIME.getMimeType(f.getName());
- }
- if (contentType != null) {
- hdrs.put("Content-Type", contentType.getName());
- }
+ hdrs.put("Content-Type", ""); // No Content-Type at file protocol level
+
this.headers.putAll(hdrs);
// response code
Modified: lucene/nutch/branches/mapred/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java?rev=357056&r1=357055&r2=357056&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java Thu Dec 15 10:49:12 2005
@@ -25,10 +25,7 @@
import org.apache.commons.net.ftp.parser.ParserInitializationException;
import org.apache.nutch.protocol.Content;
-
-import org.apache.nutch.util.NutchConf;
-import org.apache.nutch.util.mime.MimeType;
-import org.apache.nutch.util.mime.MimeTypes;
+import org.apache.nutch.protocol.ContentProperties;
import java.net.InetAddress;
import java.net.URL;
@@ -58,20 +55,12 @@
* @author John Xing
***********************************/
public class FtpResponse {
-
- /** A flag that tells if magic resolution must be performed */
- private final static boolean MAGIC =
- NutchConf.get().getBoolean("mime.type.magic", true);
-
- /** Get the MimeTypes resolver instance. */
- private final static MimeTypes MIME =
- MimeTypes.get(NutchConf.get().get("mime.types.file"));
-
+
private String orig;
private String base;
private byte[] content;
private int code;
- private Properties headers = new Properties();
+ private ContentProperties headers = new ContentProperties();
private final Ftp ftp;
@@ -314,16 +303,6 @@
ftp.httpDateFormat.toString(ftpFile.getTimestamp()));
this.content = os.toByteArray();
- MimeType contentType = null;
- if (MAGIC) {
- contentType = MIME.getMimeType(path, this.content);
- } else {
- contentType = MIME.getMimeType(path);
- }
- if (contentType != null) {
- this.headers.put("Content-Type", contentType.getName());
- }
-
// // approximate bytes sent and read
// if (this.httpAccounting != null) {
// this.httpAccounting.incrementBytesSent(path.length());
@@ -359,16 +338,6 @@
this.headers.put("Last-Modified",
ftp.httpDateFormat.toString(ftpFile.getTimestamp()));
this.content = os.toByteArray();
-
- MimeType contentType = null;
- if (MAGIC) {
- contentType = MIME.getMimeType(path, this.content);
- } else {
- contentType = MIME.getMimeType(path);
- }
- if (contentType != null) {
- this.headers.put("Content-Type", contentType.getName());
- }
// // approximate bytes sent and read
// if (this.httpAccounting != null) {
Modified: lucene/nutch/branches/mapred/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java?rev=357056&r1=357055&r2=357056&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java Thu Dec 15 10:49:12 2005
@@ -32,31 +32,20 @@
import java.util.logging.Level;
import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ContentProperties;
import org.apache.nutch.protocol.ProtocolException;
import org.apache.nutch.util.GZIPUtils;
-import org.apache.nutch.util.NutchConf;
-import org.apache.nutch.util.mime.MimeType;
-import org.apache.nutch.util.mime.MimeTypes;
/** An HTTP response. */
public class HttpResponse {
-
- /** A flag that tells if magic resolution must be performed */
- private final static boolean MAGIC =
- NutchConf.get().getBoolean("mime.type.magic", true);
-
- /** Get the MimeTypes resolver instance. */
- private final static MimeTypes MIME =
- MimeTypes.get(NutchConf.get().get("mime.types.file"));
-
private String orig;
private String base;
private byte[] content;
private int code;
- private Properties headers = new Properties();
+ private ContentProperties headers = new ContentProperties();
/** Returns the response code. */
public int getCode() { return code; }
@@ -69,21 +58,9 @@
public byte[] getContent() { return content; }
public Content toContent() {
- String contentType = getHeader("Content-Type");
- if (contentType == null) {
- MimeType type = null;
- if (MAGIC) {
- type = MIME.getMimeType(orig, content);
- } else {
- type = MIME.getMimeType(orig);
- }
- if (type != null) {
- contentType = type.getName();
- } else {
- contentType = "";
- }
- }
- return new Content(orig, base, content, contentType, headers);
+ return new Content(orig, base, content,
+ getHeader("Content-Type"),
+ headers);
}
public HttpResponse(URL url) throws ProtocolException, IOException {
Modified: lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java?rev=357056&r1=357055&r2=357056&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java Thu Dec 15 10:49:12 2005
@@ -11,6 +11,8 @@
import java.util.TreeMap;
import java.util.logging.Level;
import java.util.logging.Logger;
+
+import org.apache.nutch.protocol.ContentProperties;
import org.apache.nutch.util.LogFormatter;
import org.apache.nutch.util.NutchConf;
@@ -44,7 +46,7 @@
private HttpAuthenticationFactory() { }
- public static HttpAuthentication findAuthentication(Properties header) {
+ public static HttpAuthentication findAuthentication(ContentProperties header) {
if (header == null) return null;
try {
Modified: lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java?rev=357056&r1=357055&r2=357056&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java Thu Dec 15 10:49:12 2005
@@ -4,9 +4,6 @@
package org.apache.nutch.protocol.httpclient;
import org.apache.nutch.protocol.Content;
-import org.apache.nutch.util.NutchConf;
-import org.apache.nutch.util.mime.MimeType;
-import org.apache.nutch.util.mime.MimeTypes;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpVersion;
@@ -24,13 +21,6 @@
* An HTTP response.
*/
public class HttpResponse {
- /** A flag that tells if magic resolution must be performed */
- private final static boolean MAGIC =
- NutchConf.get().getBoolean("mime.type.magic", true);
-
- /** Get the MimeTypes resolver instance. */
- private final static MimeTypes MIME =
- MimeTypes.get(NutchConf.get().get("mime.types.file"));
private String orig;
@@ -63,22 +53,10 @@
}
public Content toContent() {
- String contentType = getHeader("Content-Type");
- if (contentType == null) {
- MimeType type = null;
- if (MAGIC) {
- type = MIME.getMimeType(orig, content);
- } else {
- type = MIME.getMimeType(orig);
- }
- if (type != null) {
- contentType = type.getName();
- } else {
- contentType = "";
- }
- }
- if (content == null) content = EMPTY_CONTENT;
- return new Content(orig, base, content, contentType, headers);
+ return new Content(orig, base,
+ (content == null ? EMPTY_CONTENT : content),
+ getHeader("Content-Type"),
+ headers);
}
public HttpResponse(URL url) throws IOException {
Modified: lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/MultiProperties.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/MultiProperties.java?rev=357056&r1=357055&r2=357056&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/MultiProperties.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/MultiProperties.java Thu Dec 15 10:49:12 2005
@@ -10,17 +10,18 @@
import java.util.Iterator;
import java.util.logging.Logger;
+import org.apache.nutch.protocol.ContentProperties;
import org.apache.nutch.util.LogFormatter;
/**
- * An extension to {@link Properties} which allows multiple values for a single key.
+ * An extension to {@link ContentProperties} which allows multiple values for a single key.
* The {@link #get(Object)} method may return a single value or a
* {@link java.util.Collection} of values.
*
* @author Matt Tencati
*/
-public class MultiProperties extends Properties {
+public class MultiProperties extends ContentProperties {
public static final Logger LOG = LogFormatter
.getLogger("net.nutch.protocol.http.MultiProperties");
@@ -31,7 +32,7 @@
*/
public MultiProperties() {
super();
- multiMap = new TreeMap();
+ multiMap = new TreeMap(String.CASE_INSENSITIVE_ORDER);
}
/**
@@ -41,7 +42,7 @@
*/
public MultiProperties(Properties defaults) {
super(defaults);
- multiMap = new TreeMap();
+ multiMap = new TreeMap(String.CASE_INSENSITIVE_ORDER);
}
/**
Modified: lucene/nutch/branches/mapred/src/test/org/apache/nutch/parse/TestParseData.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/test/org/apache/nutch/parse/TestParseData.java?rev=357056&r1=357055&r2=357056&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/test/org/apache/nutch/parse/TestParseData.java (original)
+++ lucene/nutch/branches/mapred/src/test/org/apache/nutch/parse/TestParseData.java Thu Dec 15 10:49:12 2005
@@ -16,10 +16,8 @@
package org.apache.nutch.parse;
-import java.io.*;
-import java.util.Properties;
import org.apache.nutch.io.*;
-import org.apache.nutch.pagedb.*;
+import org.apache.nutch.protocol.ContentProperties;
import junit.framework.TestCase;
/** Unit tests for ParseData. */
@@ -36,7 +34,7 @@
new Outlink("http://bar.com/", "Bar")
};
- Properties metaData = new Properties();
+ ContentProperties metaData = new ContentProperties();
metaData.put("Language", "en/us");
metaData.put("Charset", "UTF-8");
Modified: lucene/nutch/branches/mapred/src/test/org/apache/nutch/protocol/TestContent.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/test/org/apache/nutch/protocol/TestContent.java?rev=357056&r1=357055&r2=357056&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/test/org/apache/nutch/protocol/TestContent.java (original)
+++ lucene/nutch/branches/mapred/src/test/org/apache/nutch/protocol/TestContent.java Thu Dec 15 10:49:12 2005
@@ -16,10 +16,7 @@
package org.apache.nutch.protocol;
-import java.io.*;
-import java.util.Properties;
import org.apache.nutch.io.*;
-import org.apache.nutch.pagedb.*;
import junit.framework.TestCase;
/** Unit tests for Content. */
@@ -33,7 +30,7 @@
String url = "http://www.foo.com/";
- Properties metaData = new Properties();
+ ContentProperties metaData = new ContentProperties();
metaData.put("Host", "www.foo.com");
metaData.put("Content-Type", "text/html");
@@ -41,6 +38,62 @@
metaData);
TestWritable.testWritable(r);
+ assertEquals("text/html", r.getMetadata().get("Content-Type"));
+ assertEquals("text/html", r.getMetadata().get("content-type"));
+ }
+
+ /** Unit tests for getContentType(String, String, byte[]) method. */
+ public void testGetContentType() throws Exception {
+ Content c = null;
+ ContentProperties p = new ContentProperties();
+
+ c = new Content("http://www.foo.com/",
+ "http://www.foo.com/",
+ "".getBytes("UTF8"),
+ "text/html; charset=UTF-8", p);
+ assertEquals("text/html", c.getContentType());
+
+ c = new Content("http://www.foo.com/foo.html",
+ "http://www.foo.com/",
+ "".getBytes("UTF8"),
+ "", p);
+ assertEquals("text/html", c.getContentType());
+
+ c = new Content("http://www.foo.com/foo.html",
+ "http://www.foo.com/",
+ "".getBytes("UTF8"),
+ null, p);
+ assertEquals("text/html", c.getContentType());
+
+ c = new Content("http://www.foo.com/",
+ "http://www.foo.com/",
+ "<html></html>".getBytes("UTF8"),
+ "", p);
+ assertEquals("text/html", c.getContentType());
+
+ c = new Content("http://www.foo.com/foo.html",
+ "http://www.foo.com/",
+ "<html></html>".getBytes("UTF8"),
+ "text/plain", p);
+ assertEquals("text/html", c.getContentType());
+
+ c = new Content("http://www.foo.com/foo.png",
+ "http://www.foo.com/",
+ "<html></html>".getBytes("UTF8"),
+ "text/plain", p);
+ assertEquals("text/html", c.getContentType());
+
+ c = new Content("http://www.foo.com/",
+ "http://www.foo.com/",
+ "".getBytes("UTF8"),
+ "", p);
+ assertEquals("", c.getContentType());
+
+ c = new Content("http://www.foo.com/",
+ "http://www.foo.com/",
+ "".getBytes("UTF8"),
+ null, p);
+ assertNull(c.getContentType());
}
}
Modified: lucene/nutch/branches/mapred/src/test/org/apache/nutch/tools/TestSegmentMergeTool.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/test/org/apache/nutch/tools/TestSegmentMergeTool.java?rev=357056&r1=357055&r2=357056&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/test/org/apache/nutch/tools/TestSegmentMergeTool.java (original)
+++ lucene/nutch/branches/mapred/src/test/org/apache/nutch/tools/TestSegmentMergeTool.java Thu Dec 15 10:49:12 2005
@@ -32,6 +32,7 @@
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.ParseText;
import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ContentProperties;
import org.apache.nutch.protocol.ProtocolStatus;
import junit.framework.TestCase;
@@ -90,7 +91,7 @@
content.append("<p>" + k + " lines of text in the queue, " + k + " lines of text...</p>\n");
}
content.append("</body></html>");
- Properties meta = new Properties();
+ ContentProperties meta = new ContentProperties();
meta.setProperty("Content-Type", "text/html");
meta.setProperty("Host", "http://localhost");
meta.setProperty("Connection", "Keep-alive, close");
Modified: lucene/nutch/branches/mapred/src/web/jsp/cached.jsp
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/web/jsp/cached.jsp?rev=357056&r1=357055&r2=357056&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/web/jsp/cached.jsp (original)
+++ lucene/nutch/branches/mapred/src/web/jsp/cached.jsp Thu Dec 15 10:49:12 2005
@@ -7,6 +7,7 @@
import="org.apache.nutch.searcher.*"
import="org.apache.nutch.parse.ParseData"
+ import="org.apache.nutch.protocol.ContentProperties"
%><%
NutchBean bean = NutchBean.get(application);
bean.LOG.info("cache request from " + request.getRemoteAddr());
@@ -19,7 +20,7 @@
ResourceBundle.getBundle("org.nutch.jsp.cached", request.getLocale())
.getLocale().getLanguage();
- Properties metaData = bean.getParseData(details).getMetadata();
+ ContentProperties metaData = bean.getParseData(details).getMetadata();
String content = null;
String contentType = (String) metaData.get("Content-Type");