You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2007/10/09 02:23:39 UTC
svn commit: r583016 - in /lucene/nutch/trunk: ./ conf/ lib/
src/java/org/apache/nutch/parse/ src/java/org/apache/nutch/protocol/
src/java/org/apache/nutch/util/mime/
src/plugin/index-more/src/java/org/apache/nutch/indexer/more/
src/plugin/parse-zip/src...
Author: mattmann
Date: Mon Oct 8 17:23:38 2007
New Revision: 583016
URL: http://svn.apache.org/viewvc?rev=583016&view=rev
Log:
- fix for NUTCH-562
Added:
lucene/nutch/trunk/conf/tika-mimetypes.xml
lucene/nutch/trunk/lib/tika-0.1-dev.jar (with props)
Removed:
lucene/nutch/trunk/conf/mime-types.dtd
lucene/nutch/trunk/conf/mime-types.xml
lucene/nutch/trunk/src/java/org/apache/nutch/util/mime/
lucene/nutch/trunk/src/test/org/apache/nutch/util/mime/
Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/conf/nutch-default.xml
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java
lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java
lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java
lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java
Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=583016&r1=583015&r2=583016&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Mon Oct 8 17:23:38 2007
@@ -144,6 +144,9 @@
49. NUTCH-508 - ${hadoop.log.dir} and ${hadoop.log.file} are not propagated
to the tasktracker. (Mathijs Homminga, Emmanuel Joke via dogacan)
+
+50. NUTCH-562 - Port mime type framework to use Tika mime detection framework.
+ (mattmann)
Release 0.9 - 2007-04-02
Modified: lucene/nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/nutch-default.xml?rev=583016&r1=583015&r2=583016&view=diff
==============================================================================
--- lucene/nutch/trunk/conf/nutch-default.xml (original)
+++ lucene/nutch/trunk/conf/nutch-default.xml Mon Oct 8 17:23:38 2007
@@ -784,7 +784,7 @@
<property>
<name>mime.types.file</name>
- <value>mime-types.xml</value>
+ <value>tika-mimetypes.xml</value>
<description>Name of file in CLASSPATH containing filename extension and
magic sequence to mime types mapping information</description>
</property>
Added: lucene/nutch/trunk/conf/tika-mimetypes.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/tika-mimetypes.xml?rev=583016&view=auto
==============================================================================
--- lucene/nutch/trunk/conf/tika-mimetypes.xml (added)
+++ lucene/nutch/trunk/conf/tika-mimetypes.xml Mon Oct 8 17:23:38 2007
@@ -0,0 +1,368 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+ Description: This xml file defines the valid mime types used by Tika.
+ The mime types within this file are based on the types in the mime-types.xml
+ file available in Apache Nutch.
+-->
+
+<mime-info>
+
+ <mime-type type="text/plain">
+ <magic priority="50">
+ <match value="This is TeX," type="string" offset="0" />
+ <match value="This is METAFONT," type="string" offset="0" />
+ </magic>
+ <glob pattern="*.txt" />
+ <glob pattern="*.asc" />
+ </mime-type>
+
+ <mime-type type="text/html">
+ <magic priority="50">
+ <match value="&lt;!DOCTYPE HTML" type="string"
+ offset="0:64" />
+ <match value="&lt;!doctype html" type="string"
+ offset="0:64" />
+ <match value="&lt;HEAD" type="string" offset="0:64" />
+ <match value="&lt;head" type="string" offset="0:64" />
+ <match value="&lt;TITLE" type="string" offset="0:64" />
+ <match value="&lt;title" type="string" offset="0:64" />
+ <match value="&lt;html" type="string" offset="0:64" />
+ <match value="&lt;HTML" type="string" offset="0:64" />
+ <match value="&lt;BODY" type="string" offset="0" />
+ <match value="&lt;body" type="string" offset="0" />
+ <match value="&lt;TITLE" type="string" offset="0" />
+ <match value="&lt;title" type="string" offset="0" />
+ <match value="&lt;!--" type="string" offset="0" />
+ <match value="&lt;h1" type="string" offset="0" />
+ <match value="&lt;H1" type="string" offset="0" />
+ <match value="&lt;!doctype HTML" type="string" offset="0" />
+ <match value="&lt;!DOCTYPE html" type="string" offset="0" />
+ </magic>
+ <glob pattern="*.html" />
+ <glob pattern="*.htm" />
+ </mime-type>
+
+ <mime-type type="application/xhtml+xml">
+ <sub-class-of type="text/xml" />
+ <glob pattern="*.xhtml" />
+ <root-XML namespaceURI='http://www.w3.org/1999/xhtml'
+ localName='html' />
+ </mime-type>
+
+ <mime-type type="application/vnd.ms-powerpoint">
+ <glob pattern="*.ppz" />
+ <glob pattern="*.ppt" />
+ <glob pattern="*.pps" />
+ <glob pattern="*.pot" />
+ <magic priority="50">
+ <match value="0xcfd0e011" type="little32" offset="0" />
+ </magic>
+ </mime-type>
+
+ <mime-type type="application/vnd.ms-excel">
+ <magic priority="50">
+ <match value="Microsoft Excel 5.0 Worksheet" type="string"
+ offset="2080" />
+ </magic>
+ <glob pattern="*.xls" />
+ <glob pattern="*.xlc" />
+ <glob pattern="*.xll" />
+ <glob pattern="*.xlm" />
+ <glob pattern="*.xlw" />
+ <glob pattern="*.xla" />
+ <glob pattern="*.xlt" />
+ <glob pattern="*.xld" />
+ <alias type="application/msexcel" />
+ </mime-type>
+
+ <mime-type type="application/vnd.oasis.opendocument.text">
+ <glob pattern="*.odt" />
+ </mime-type>
+
+
+ <mime-type type="application/zip">
+ <alias type="application/x-zip-compressed" />
+ <magic priority="40">
+ <match value="PK\003\004" type="string" offset="0" />
+ </magic>
+ <glob pattern="*.zip" />
+ </mime-type>
+
+ <mime-type type="application/vnd.oasis.opendocument.text">
+ <glob pattern="*.oth" />
+ </mime-type>
+
+ <mime-type type="application/msword">
+ <magic priority="50">
+ <match value="\x31\xbe\x00\x00" type="string" offset="0" />
+ <match value="PO^Q`" type="string" offset="0" />
+ <match value="\376\067\0\043" type="string" offset="0" />
+ <match value="\333\245-\0\0\0" type="string" offset="0" />
+ <match value="Microsoft Word 6.0 Document" type="string"
+ offset="2080" />
+ <match value="Microsoft Word document data" type="string"
+ offset="2112" />
+ </magic>
+ <glob pattern="*.doc" />
+ <alias type="application/vnd.ms-word" />
+ </mime-type>
+
+ <mime-type type="application/octet-stream">
+ <magic priority="50">
+ <match value="\037\036" type="string" offset="0" />
+ <match value="017437" type="host16" offset="0" />
+ <match value="0x1fff" type="host16" offset="0" />
+ <match value="\377\037" type="string" offset="0" />
+ <match value="0145405" type="host16" offset="0" />
+ </magic>
+ <glob pattern="*.bin" />
+ </mime-type>
+
+ <mime-type type="application/pdf">
+ <magic priority="50">
+ <match value="%PDF-" type="string" offset="0" />
+ </magic>
+ <glob pattern="*.pdf" />
+ <alias type="application/x-pdf" />
+ </mime-type>
+
+ <mime-type type="application/atom+xml">
+ <root-XML localName="feed"
+ namespaceURI="http://purl.org/atom/ns#" />
+ </mime-type>
+
+ <mime-type type="application/mac-binhex40">
+ <glob pattern="*.hqx" />
+ </mime-type>
+
+ <mime-type type="application/mac-compactpro">
+ <glob pattern="*.cpt" />
+ </mime-type>
+
+ <mime-type type="application/rtf">
+ <glob pattern="*.rtf"/>
+ <alias type="text/rtf" />
+ </mime-type>
+
+ <mime-type type="application/rss+xml">
+ <alias type="text/rss" />
+ <root-XML localName="rss" />
+ <root-XML namespaceURI="http://purl.org/rss/1.0/" />
+ <glob pattern="*.rss" />
+ </mime-type>
+
+ <!-- added in by mattmann -->
+ <mime-type type="application/xml">
+ <alias type="text/xml" />
+ <glob pattern="*.xml" />
+ </mime-type>
+
+ <mime-type type="application/x-mif">
+ <alias type="application/vnd.mif" />
+ </mime-type>
+
+ <mime-type type="application/vnd.wap.wbxml">
+ <glob pattern="*.wbxml" />
+ </mime-type>
+
+ <mime-type type="application/vnd.wap.wmlc">
+ <_comment>Compiled WML Document</_comment>
+ <glob pattern="*.wmlc" />
+ </mime-type>
+
+ <mime-type type="application/vnd.wap.wmlscriptc">
+ <_comment>Compiled WML Script</_comment>
+ <glob pattern="*.wmlsc" />
+ </mime-type>
+
+ <mime-type type="text/vnd.wap.wmlscript">
+ <_comment>WML Script</_comment>
+ <glob pattern="*.wmls" />
+ </mime-type>
+
+ <mime-type type="application/x-bzip">
+ <alias type="application/x-bzip2" />
+ </mime-type>
+
+ <mime-type type="application/x-bzip-compressed-tar">
+ <glob pattern="*.tbz" />
+ <glob pattern="*.tbz2" />
+ </mime-type>
+
+ <mime-type type="application/x-cdlink">
+ <_comment>Virtual CD-ROM CD Image File</_comment>
+ <glob pattern="*.vcd" />
+ </mime-type>
+
+ <mime-type type="application/x-director">
+ <_comment>Shockwave Movie</_comment>
+ <glob pattern="*.dcr" />
+ <glob pattern="*.dir" />
+ <glob pattern="*.dxr" />
+ </mime-type>
+
+ <mime-type type="application/x-futuresplash">
+ <_comment>Macromedia FutureSplash File</_comment>
+ <glob pattern="*.spl" />
+ </mime-type>
+
+ <mime-type type="application/x-java">
+ <alias type="application/java" />
+ </mime-type>
+
+ <mime-type type="application/x-koan">
+ <_comment>SSEYO Koan File</_comment>
+ <glob pattern="*.skp" />
+ <glob pattern="*.skd" />
+ <glob pattern="*.skt" />
+ <glob pattern="*.skm" />
+ </mime-type>
+
+ <mime-type type="application/x-latex">
+ <_comment>LaTeX Source Document</_comment>
+ <glob pattern="*.latex" />
+ </mime-type>
+
+ <!-- JC CHANGED
+ <mime-type type="application/x-mif">
+ <_comment>FrameMaker MIF document</_comment>
+ <glob pattern="*.mif"/>
+ </mime-type> -->
+
+ <mime-type type="application/x-ms-dos-executable">
+ <alias type="application/x-dosexec;exe" />
+ </mime-type>
+
+ <mime-type type="application/ogg">
+ <alias type="application/x-ogg" />
+ </mime-type>
+
+ <mime-type type="application/x-rar">
+ <alias type="application/x-rar-compressed" />
+ </mime-type>
+
+ <mime-type type="application/x-shellscript">
+ <alias type="application/x-sh" />
+ </mime-type>
+
+ <mime-type type="application/xhtml+xml">
+ <glob pattern="*.xht" />
+ </mime-type>
+
+ <mime-type type="audio/midi">
+ <glob pattern="*.kar" />
+ </mime-type>
+
+ <mime-type type="audio/x-pn-realaudio">
+ <alias type="audio/x-realaudio" />
+ </mime-type>
+
+ <mime-type type="image/tiff">
+ <magic priority="50">
+ <match value="0x4d4d2a00" type="string" offset="0" />
+ <match value="0x49492a00" type="string" offset="0" />
+ </magic>
+ </mime-type>
+
+ <mime-type type="message/rfc822">
+ <magic priority="50">
+ <match type="string" value="Relay-Version:" offset="0" />
+ <match type="string" value="#! rnews" offset="0" />
+ <match type="string" value="N#! rnews" offset="0" />
+ <match type="string" value="Forward to" offset="0" />
+ <match type="string" value="Pipe to" offset="0" />
+ <match type="string" value="Return-Path:" offset="0" />
+ <match type="string" value="From:" offset="0" />
+ <match type="string" value="Message-ID:" offset="0" />
+ <match type="string" value="Date:" offset="0" />
+ </magic>
+ </mime-type>
+
+ <mime-type type="application/x-javascript">
+ <glob pattern="*.js" />
+ </mime-type>
+
+
+ <mime-type type="image/vnd.wap.wbmp">
+ <_comment>Wireless Bitmap File Format</_comment>
+ <glob pattern="*.wbmp" />
+ </mime-type>
+
+ <mime-type type="image/x-psd">
+ <alias type="image/photoshop" />
+ </mime-type>
+
+ <mime-type type="image/x-xcf">
+ <alias type="image/xcf" />
+ <magic priority="50">
+ <match type="string" value="gimp xcf " offset="0" />
+ </magic>
+ </mime-type>
+
+ <mime-type type="application/x-shockwave-flash">
+ <glob pattern="*.swf"/>
+ <magic priority="50">
+ <match type="string" value="FWS" offset="0"/>
+ <match type="string" value="CWS" offset="0"/>
+ </magic>
+ </mime-type>
+
+ <mime-type type="model/iges">
+ <_comment>
+ Initial Graphics Exchange Specification Format
+ </_comment>
+ <glob pattern="*.igs" />
+ <glob pattern="*.iges" />
+ </mime-type>
+
+ <mime-type type="model/mesh">
+ <glob pattern="*.msh" />
+ <glob pattern="*.mesh" />
+ <glob pattern="*.silo" />
+ </mime-type>
+
+ <mime-type type="model/vrml">
+ <glob pattern="*.vrml" />
+ </mime-type>
+
+ <mime-type type="text/x-tcl">
+ <alias type="application/x-tcl" />
+ </mime-type>
+
+ <mime-type type="text/x-tex">
+ <alias type="application/x-tex" />
+ </mime-type>
+
+ <mime-type type="text/x-texinfo">
+ <alias type="application/x-texinfo" />
+ </mime-type>
+
+ <mime-type type="text/x-troff-me">
+ <alias type="application/x-troff-me" />
+ </mime-type>
+
+ <mime-type type="video/vnd.mpegurl">
+ <glob pattern="*.mxu" />
+ </mime-type>
+
+ <mime-type type="x-conference/x-cooltalk">
+ <_comment>Cooltalk Audio</_comment>
+ <glob pattern="*.ice" />
+ </mime-type>
+
+</mime-info>
Added: lucene/nutch/trunk/lib/tika-0.1-dev.jar
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/tika-0.1-dev.jar?rev=583016&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/nutch/trunk/lib/tika-0.1-dev.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java?rev=583016&r1=583015&r2=583016&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java Mon Oct 8 17:23:38 2007
@@ -36,8 +36,10 @@
import org.apache.nutch.plugin.PluginRuntimeException;
import org.apache.nutch.plugin.PluginRepository;
import org.apache.nutch.util.LogUtil;
-import org.apache.nutch.util.mime.MimeType;
-import org.apache.nutch.util.mime.MimeTypeException;
+
+// Tika imports
+import org.apache.tika.mime.MimeType;
+import org.apache.tika.mime.MimeTypeException;
/** Creates and caches {@link Parser} plugins.*/
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java?rev=583016&r1=583015&r2=583016&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java Mon Oct 8 17:23:38 2007
@@ -17,6 +17,7 @@
package org.apache.nutch.protocol;
+//JDK imports
import java.io.ByteArrayInputStream;
import java.io.DataInput;
import java.io.DataInputStream;
@@ -25,6 +26,7 @@
import java.util.Arrays;
import java.util.zip.InflaterInputStream;
+//Hadoop imports
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
@@ -33,11 +35,16 @@
import org.apache.hadoop.io.UTF8;
import org.apache.hadoop.io.VersionMismatchException;
import org.apache.hadoop.io.Writable;
+
+//Nutch imports
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.util.NutchConfiguration;
-import org.apache.nutch.util.mime.MimeType;
-import org.apache.nutch.util.mime.MimeTypeException;
-import org.apache.nutch.util.mime.MimeTypes;
+
+//Tika imports
+import org.apache.tika.mime.MimeType;
+import org.apache.tika.mime.MimeTypeException;
+import org.apache.tika.mime.MimeUtils;
+
public final class Content implements Writable{
@@ -59,7 +66,7 @@
private boolean mimeTypeMagic;
- private MimeTypes mimeTypes;
+ private static MimeUtils mimeTypes;
public Content() {
metadata = new Metadata();
@@ -82,7 +89,9 @@
this.content = content;
this.metadata = metadata;
this.mimeTypeMagic = conf.getBoolean("mime.type.magic", true);
- this.mimeTypes = MimeTypes.get(conf.get("mime.types.file"));
+ if(this.mimeTypes == null){
+ this.mimeTypes = new MimeUtils(conf.get("mime.types.file"), this.mimeTypeMagic);
+ }
this.contentType = getContentType(contentType, url, content);
}
@@ -281,28 +290,40 @@
private String getContentType(String typeName, String url, byte[] data) {
MimeType type = null;
+ String cleanedMimeType = null;
+
try {
- typeName = MimeType.clean(typeName);
- type = typeName == null ? null : this.mimeTypes.forName(typeName);
+ cleanedMimeType = MimeType.clean(typeName);
} catch (MimeTypeException mte) {
// Seems to be a malformed mime type name...
}
- if (typeName == null || type == null || !type.matches(url)) {
+ // first try to get the type from the cleaned type name
+ type = cleanedMimeType != null ? this.mimeTypes.getRepository().forName(
+ cleanedMimeType) : null;
+
+ // if returned null, then try url resolution
+ if (type == null) {
// If no mime-type header, or cannot find a corresponding registered
- // mime-type, or the one found doesn't match the url pattern
- // it shouldbe, then guess a mime-type from the url pattern
- type = this.mimeTypes.getMimeType(url);
- typeName = type == null ? typeName : type.getName();
+ // mime-type, then guess a mime-type from the url pattern
+ type = this.mimeTypes.getRepository().getMimeType(url) != null ? this.mimeTypes
+ .getRepository().getMimeType(url)
+ : type;
}
- if (typeName == null || type == null
- || (this.mimeTypeMagic && type.hasMagic() && !type.matches(data))) {
- // If no mime-type already found, or the one found doesn't match
- // the magic bytes it should be, then, guess a mime-type from the
- // document content (magic bytes)
- type = this.mimeTypes.getMimeType(data);
- typeName = type == null ? typeName : type.getName();
+
+ // If magic detection is enabled, also guess the mime type from the
+ // content's magic bytes. If that guess differs from the type resolved
+ // so far, prefer the type returned by the magic detection.
+ if (this.mimeTypeMagic) {
+ MimeType magicType = this.mimeTypes.getRepository().getMimeType(data);
+ if (magicType != null && !type.getName().equals(magicType.getName())) {
+ // If magic enabled and the current mime type differs from that of the
+ // one returned from the magic, take the magic mimeType
+
+ type = magicType;
+ }
}
- return typeName;
+
+ return type.getName();
}
}
Modified: lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=583016&r1=583015&r2=583016&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java Mon Oct 8 17:23:38 2007
@@ -23,6 +23,9 @@
import org.apache.oro.text.regex.PatternMatcher;
import org.apache.oro.text.regex.MatchResult;
import org.apache.oro.text.regex.MalformedPatternException;
+import org.apache.tika.mime.MimeType;
+import org.apache.tika.mime.MimeTypeException;
+import org.apache.tika.mime.MimeUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
@@ -44,10 +47,6 @@
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.util.mime.MimeType;
-import org.apache.nutch.util.mime.MimeTypes;
-import org.apache.nutch.util.mime.MimeTypeException;
-
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
@@ -80,7 +79,7 @@
private boolean MAGIC;
/** Get the MimeTypes resolver instance. */
- private MimeTypes MIME;
+ private static MimeUtils MIME;
public Document filter(Document doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
throws IndexingException {
@@ -194,7 +193,7 @@
// } else {
// contentType = MIME.getMimeType(url);
// }
- mimeType = MIME.getMimeType(url);
+ mimeType = MIME.getRepository().getMimeType(url);
} else {
try {
mimeType = new MimeType(contentType);
@@ -281,7 +280,8 @@
public void setConf(Configuration conf) {
this.conf = conf;
MAGIC = conf.getBoolean("mime.type.magic", true);
- MIME = MimeTypes.get(getConf().get("mime.types.file"));
+ if(MIME == null)
+ MIME = new MimeUtils(getConf().get("mime.types.file"), MAGIC);
}
public Configuration getConf() {
Modified: lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java?rev=583016&r1=583015&r2=583016&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java Mon Oct 8 17:23:38 2007
@@ -42,7 +42,10 @@
import org.apache.nutch.parse.ParseException;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.protocol.Content;
-import org.apache.nutch.util.mime.MimeTypes;
+
+// Tika imports
+import org.apache.tika.mime.MimeUtils;
+
/**
@@ -52,17 +55,18 @@
public class ZipTextExtractor {
/** Get the MimeTypes resolver instance. */
- private MimeTypes MIME;
+ private static MimeUtils MIME;
public static final Log LOG = LogFactory.getLog(ZipTextExtractor.class);
-private Configuration conf;
+ private Configuration conf;
/** Creates a new instance of ZipTextExtractor */
public ZipTextExtractor(Configuration conf) {
this.conf = conf;
- this.MIME = MimeTypes.get(conf.get("mime.types.file"));
+ if(this.MIME == null)
+ this.MIME = new MimeUtils(conf.get("mime.types.file"),conf.getBoolean("mime.type.magic", true));
}
public String extractText(InputStream input, String url, List outLinksList) throws IOException {
@@ -92,7 +96,7 @@
int i = fname.lastIndexOf('.');
if (i != -1) {
// Trying to resolve the Mime-Type
- String contentType = MIME.getMimeType(fname).getName();
+ String contentType = MIME.getRepository().getMimeType(fname).getName();
try {
Metadata metadata = new Metadata();
metadata.set(Response.CONTENT_LENGTH, Long.toString(entry.getSize()));
Modified: lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java?rev=583016&r1=583015&r2=583016&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java (original)
+++ lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java Mon Oct 8 17:23:38 2007
@@ -26,12 +26,14 @@
// Nutch imports
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.protocol.Content;
-import org.apache.nutch.util.mime.MimeType;
-import org.apache.nutch.util.mime.MimeTypes;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.net.protocols.HttpDateFormat;
import org.apache.nutch.net.protocols.Response;
+// Tika imports
+import org.apache.tika.mime.MimeType;
+import org.apache.tika.mime.MimeUtils;
+
// Hadoop imports
import org.apache.hadoop.conf.Configuration;
@@ -73,6 +75,8 @@
private final File file;
private Configuration conf;
+
+ private static MimeUtils MIME;
/** Returns the response code. */
public int getCode() { return code; }
@@ -97,6 +101,9 @@
this.base = url.toString();
this.file = file;
this.conf = conf;
+
+ if(MIME == null)
+ MIME = new MimeUtils(conf.get("mime.types.file"),conf.getBoolean("mime.type.magic", true));
if (!"file".equals(url.getProtocol()))
throw new FileException("Not a file url:" + url);
@@ -202,8 +209,8 @@
headers.set(Response.CONTENT_LENGTH, new Long(size).toString());
headers.set(Response.LAST_MODIFIED, HttpDateFormat.toString(f
.lastModified()));
- MimeTypes mimeTypes = MimeTypes.get(conf.get("mime.types.file"));
- MimeType mimeType = mimeTypes.getMimeType(f);
+
+ MimeType mimeType = MIME.getRepository().getMimeType(f);
String mimeTypeString = mimeType != null ? mimeType.getName() : "";
headers.set(Response.CONTENT_TYPE, mimeTypeString);
Modified: lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java?rev=583016&r1=583015&r2=583016&view=diff
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java (original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java Mon Oct 8 17:23:38 2007
@@ -22,6 +22,7 @@
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.WritableTestUtils;
+import org.apache.tika.mime.MimeTypes;
import junit.framework.TestCase;
@@ -98,13 +99,13 @@
"http://www.foo.com/",
"".getBytes("UTF8"),
"", p, conf);
- assertEquals("", c.getContentType());
+ assertEquals(MimeTypes.DEFAULT, c.getContentType());
c = new Content("http://www.foo.com/",
"http://www.foo.com/",
"".getBytes("UTF8"),
null, p, conf);
- assertNull(c.getContentType());
+ assertNotNull(c.getContentType());
}
}