You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2007/10/09 02:23:39 UTC

svn commit: r583016 - in /lucene/nutch/trunk: ./ conf/ lib/ src/java/org/apache/nutch/parse/ src/java/org/apache/nutch/protocol/ src/java/org/apache/nutch/util/mime/ src/plugin/index-more/src/java/org/apache/nutch/indexer/more/ src/plugin/parse-zip/src...

Author: mattmann
Date: Mon Oct  8 17:23:38 2007
New Revision: 583016

URL: http://svn.apache.org/viewvc?rev=583016&view=rev
Log:
- fix for NUTCH-562

Added:
    lucene/nutch/trunk/conf/tika-mimetypes.xml
    lucene/nutch/trunk/lib/tika-0.1-dev.jar   (with props)
Removed:
    lucene/nutch/trunk/conf/mime-types.dtd
    lucene/nutch/trunk/conf/mime-types.xml
    lucene/nutch/trunk/src/java/org/apache/nutch/util/mime/
    lucene/nutch/trunk/src/test/org/apache/nutch/util/mime/
Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/conf/nutch-default.xml
    lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java
    lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java
    lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
    lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java
    lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
    lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=583016&r1=583015&r2=583016&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Mon Oct  8 17:23:38 2007
@@ -144,6 +144,9 @@
 
 49. NUTCH-508 - ${hadoop.log.dir} and ${hadoop.log.file} are not propagated
     to the tasktracker. (Mathijs Homminga, Emmanuel Joke via dogacan)
+    
+50. NUTCH-562 - Port mime type framework to use Tika mime detection framework.
+    (mattmann)
 
 Release 0.9 - 2007-04-02
 

Modified: lucene/nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/nutch-default.xml?rev=583016&r1=583015&r2=583016&view=diff
==============================================================================
--- lucene/nutch/trunk/conf/nutch-default.xml (original)
+++ lucene/nutch/trunk/conf/nutch-default.xml Mon Oct  8 17:23:38 2007
@@ -784,7 +784,7 @@
 
 <property>
   <name>mime.types.file</name>
-  <value>mime-types.xml</value>
+  <value>tika-mimetypes.xml</value>
   <description>Name of file in CLASSPATH containing filename extension and
   magic sequence to mime types mapping information</description>
 </property>

Added: lucene/nutch/trunk/conf/tika-mimetypes.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/tika-mimetypes.xml?rev=583016&view=auto
==============================================================================
--- lucene/nutch/trunk/conf/tika-mimetypes.xml (added)
+++ lucene/nutch/trunk/conf/tika-mimetypes.xml Mon Oct  8 17:23:38 2007
@@ -0,0 +1,368 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+	Licensed to the Apache Software Foundation (ASF) under one or more
+	contributor license agreements.  See the NOTICE file distributed with
+	this work for additional information regarding copyright ownership.
+	The ASF licenses this file to You under the Apache License, Version 2.0
+	(the "License"); you may not use this file except in compliance with
+	the License.  You may obtain a copy of the License at
+	
+	http://www.apache.org/licenses/LICENSE-2.0
+	
+	Unless required by applicable law or agreed to in writing, software
+	distributed under the License is distributed on an "AS IS" BASIS,
+	WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	See the License for the specific language governing permissions and
+	limitations under the License.
+	
+	Description: This xml file defines the valid mime types used by Tika.
+	The mime types within this file are based on the types in the mime-types.xml 
+	file available in Apache Nutch.
+-->
+
+<mime-info>
+
+	<mime-type type="text/plain">
+		<magic priority="50">
+			<match value="This is TeX," type="string" offset="0" />
+			<match value="This is METAFONT," type="string" offset="0" />
+		</magic>
+		<glob pattern="*.txt" />
+		<glob pattern="*.asc" />
+	</mime-type>
+
+	<mime-type type="text/html">
+		<magic priority="50">
+			<match value="&lt;!DOCTYPE HTML" type="string"
+				offset="0:64" />
+			<match value="&lt;!doctype html" type="string"
+				offset="0:64" />
+			<match value="&lt;HEAD" type="string" offset="0:64" />
+			<match value="&lt;head" type="string" offset="0:64" />
+			<match value="&lt;TITLE" type="string" offset="0:64" />
+			<match value="&lt;title" type="string" offset="0:64" />
+			<match value="&lt;html" type="string" offset="0:64" />
+			<match value="&lt;HTML" type="string" offset="0:64" />
+			<match value="&lt;BODY" type="string" offset="0" />
+			<match value="&lt;body" type="string" offset="0" />
+			<match value="&lt;TITLE" type="string" offset="0" />
+			<match value="&lt;title" type="string" offset="0" />
+			<match value="&lt;!--" type="string" offset="0" />
+			<match value="&lt;h1" type="string" offset="0" />
+			<match value="&lt;H1" type="string" offset="0" />
+			<match value="&lt;!doctype HTML" type="string" offset="0" />
+			<match value="&lt;!DOCTYPE html" type="string" offset="0" />
+		</magic>
+		<glob pattern="*.html" />
+		<glob pattern="*.htm" />
+	</mime-type>
+
+	<mime-type type="application/xhtml+xml">
+		<sub-class-of type="text/xml" />
+		<glob pattern="*.xhtml" />
+		<root-XML namespaceURI='http://www.w3.org/1999/xhtml'
+			localName='html' />
+	</mime-type>
+
+	<mime-type type="application/vnd.ms-powerpoint">
+		<glob pattern="*.ppz" />
+		<glob pattern="*.ppt" />
+		<glob pattern="*.pps" />
+		<glob pattern="*.pot" />
+		<magic priority="50">
+			<match value="0xcfd0e011" type="little32" offset="0" />
+		</magic>
+	</mime-type>
+
+	<mime-type type="application/vnd.ms-excel">
+		<magic priority="50">
+			<match value="Microsoft Excel 5.0 Worksheet" type="string"
+				offset="2080" />
+		</magic>
+		<glob pattern="*.xls" />
+		<glob pattern="*.xlc" />
+		<glob pattern="*.xll" />
+		<glob pattern="*.xlm" />
+		<glob pattern="*.xlw" />
+		<glob pattern="*.xla" />
+		<glob pattern="*.xlt" />
+		<glob pattern="*.xld" />
+		<alias type="application/msexcel" />
+	</mime-type>
+
+	<mime-type type="application/vnd.oasis.opendocument.text">
+		<glob pattern="*.odt" />
+	</mime-type>
+
+
+	<mime-type type="application/zip">
+		<alias type="application/x-zip-compressed" />
+		<magic priority="40">
+			<match value="PK\003\004" type="string" offset="0" />
+		</magic>
+		<glob pattern="*.zip" />
+	</mime-type>
+
+	<mime-type type="application/vnd.oasis.opendocument.text">
+		<glob pattern="*.oth" />
+	</mime-type>
+
+	<mime-type type="application/msword">
+		<magic priority="50">
+			<match value="\x31\xbe\x00\x00" type="string" offset="0" />
+			<match value="PO^Q`" type="string" offset="0" />
+			<match value="\376\067\0\043" type="string" offset="0" />
+			<match value="\333\245-\0\0\0" type="string" offset="0" />
+			<match value="Microsoft Word 6.0 Document" type="string"
+				offset="2080" />
+			<match value="Microsoft Word document data" type="string"
+				offset="2112" />
+		</magic>
+		<glob pattern="*.doc" />
+		<alias type="application/vnd.ms-word" />
+	</mime-type>
+
+	<mime-type type="application/octet-stream">
+		<magic priority="50">
+			<match value="\037\036" type="string" offset="0" />
+			<match value="017437" type="host16" offset="0" />
+			<match value="0x1fff" type="host16" offset="0" />
+			<match value="\377\037" type="string" offset="0" />
+			<match value="0145405" type="host16" offset="0" />
+		</magic>
+		<glob pattern="*.bin" />
+	</mime-type>
+
+	<mime-type type="application/pdf">
+		<magic priority="50">
+			<match value="%PDF-" type="string" offset="0" />
+		</magic>
+		<glob pattern="*.pdf" />
+		<alias type="application/x-pdf" />
+	</mime-type>
+
+	<mime-type type="application/atom+xml">
+		<root-XML localName="feed"
+			namespaceURI="http://purl.org/atom/ns#" />
+	</mime-type>
+
+	<mime-type type="application/mac-binhex40">
+		<glob pattern="*.hqx" />
+	</mime-type>
+
+	<mime-type type="application/mac-compactpro">
+		<glob pattern="*.cpt" />
+	</mime-type>
+
+	<mime-type type="application/rtf">
+	    <glob pattern="*.rtf"/>
+		<alias type="text/rtf" />
+	</mime-type>
+
+	<mime-type type="application/rss+xml">
+		<alias type="text/rss" />
+		<root-XML localName="rss" />
+		<root-XML namespaceURI="http://purl.org/rss/1.0/" />
+		<glob pattern="*.rss" />
+	</mime-type>
+
+	<!-- added by mattmann -->
+	<mime-type type="application/xml">
+		<alias type="text/xml" />
+		<glob pattern="*.xml" />
+	</mime-type>
+
+	<mime-type type="application/x-mif">
+		<alias type="application/vnd.mif" />
+	</mime-type>
+
+	<mime-type type="application/vnd.wap.wbxml">
+		<glob pattern="*.wbxml" />
+	</mime-type>
+
+	<mime-type type="application/vnd.wap.wmlc">
+		<_comment>Compiled WML Document</_comment>
+		<glob pattern="*.wmlc" />
+	</mime-type>
+
+	<mime-type type="application/vnd.wap.wmlscriptc">
+		<_comment>Compiled WML Script</_comment>
+		<glob pattern="*.wmlsc" />
+	</mime-type>
+
+	<mime-type type="text/vnd.wap.wmlscript">
+		<_comment>WML Script</_comment>
+		<glob pattern="*.wmls" />
+	</mime-type>
+
+	<mime-type type="application/x-bzip">
+		<alias type="application/x-bzip2" />
+	</mime-type>
+
+	<mime-type type="application/x-bzip-compressed-tar">
+		<glob pattern="*.tbz" />
+		<glob pattern="*.tbz2" />
+	</mime-type>
+
+	<mime-type type="application/x-cdlink">
+		<_comment>Virtual CD-ROM CD Image File</_comment>
+		<glob pattern="*.vcd" />
+	</mime-type>
+
+	<mime-type type="application/x-director">
+		<_comment>Shockwave Movie</_comment>
+		<glob pattern="*.dcr" />
+		<glob pattern="*.dir" />
+		<glob pattern="*.dxr" />
+	</mime-type>
+
+	<mime-type type="application/x-futuresplash">
+		<_comment>Macromedia FutureSplash File</_comment>
+		<glob pattern="*.spl" />
+	</mime-type>
+
+	<mime-type type="application/x-java">
+		<alias type="application/java" />
+	</mime-type>
+
+	<mime-type type="application/x-koan">
+		<_comment>SSEYO Koan File</_comment>
+		<glob pattern="*.skp" />
+		<glob pattern="*.skd" />
+		<glob pattern="*.skt" />
+		<glob pattern="*.skm" />
+	</mime-type>
+
+	<mime-type type="application/x-latex">
+		<_comment>LaTeX Source Document</_comment>
+		<glob pattern="*.latex" />
+	</mime-type>
+
+	<!-- JC CHANGED
+		<mime-type type="application/x-mif">
+		<_comment>FrameMaker MIF document</_comment>
+		<glob pattern="*.mif"/>
+		</mime-type> -->
+
+	<mime-type type="application/x-ms-dos-executable">
+		<alias type="application/x-dosexec;exe" />
+	</mime-type>
+
+	<mime-type type="application/ogg">
+		<alias type="application/x-ogg" />
+	</mime-type>
+
+	<mime-type type="application/x-rar">
+		<alias type="application/x-rar-compressed" />
+	</mime-type>
+
+	<mime-type type="application/x-shellscript">
+		<alias type="application/x-sh" />
+	</mime-type>
+
+	<mime-type type="application/xhtml+xml">
+		<glob pattern="*.xht" />
+	</mime-type>
+
+	<mime-type type="audio/midi">
+		<glob pattern="*.kar" />
+	</mime-type>
+
+	<mime-type type="audio/x-pn-realaudio">
+		<alias type="audio/x-realaudio" />
+	</mime-type>
+
+	<mime-type type="image/tiff">
+		<magic priority="50">
+			<match value="0x4d4d2a00" type="string" offset="0" />
+			<match value="0x49492a00" type="string" offset="0" />
+		</magic>
+	</mime-type>
+
+	<mime-type type="message/rfc822">
+		<magic priority="50">
+			<match type="string" value="Relay-Version:" offset="0" />
+			<match type="string" value="#! rnews" offset="0" />
+			<match type="string" value="N#! rnews" offset="0" />
+			<match type="string" value="Forward to" offset="0" />
+			<match type="string" value="Pipe to" offset="0" />
+			<match type="string" value="Return-Path:" offset="0" />
+			<match type="string" value="From:" offset="0" />
+			<match type="string" value="Message-ID:" offset="0" />
+			<match type="string" value="Date:" offset="0" />
+		</magic>
+	</mime-type>
+	
+	<mime-type type="application/x-javascript">
+        <glob pattern="*.js" />
+    </mime-type>
+    
+
+	<mime-type type="image/vnd.wap.wbmp">
+		<_comment>Wireless Bitmap File Format</_comment>
+		<glob pattern="*.wbmp" />
+	</mime-type>
+
+	<mime-type type="image/x-psd">
+		<alias type="image/photoshop" />
+	</mime-type>
+
+	<mime-type type="image/x-xcf">
+		<alias type="image/xcf" />
+		<magic priority="50">
+			<match type="string" value="gimp xcf " offset="0" />
+		</magic>
+	</mime-type>
+	
+	<mime-type type="application/x-shockwave-flash">
+      <glob pattern="*.swf"/>
+      <magic priority="50">
+        <match type="string" value="FWS" offset="0"/>
+        <match type="string" value="CWS" offset="0"/>
+      </magic>
+    </mime-type>
+
+	<mime-type type="model/iges">
+		<_comment>
+			Initial Graphics Exchange Specification Format
+		</_comment>
+		<glob pattern="*.igs" />
+		<glob pattern="*.iges" />
+	</mime-type>
+
+	<mime-type type="model/mesh">
+		<glob pattern="*.msh" />
+		<glob pattern="*.mesh" />
+		<glob pattern="*.silo" />
+	</mime-type>
+
+	<mime-type type="model/vrml">
+		<glob pattern="*.vrml" />
+	</mime-type>
+
+	<mime-type type="text/x-tcl">
+		<alias type="application/x-tcl" />
+	</mime-type>
+
+	<mime-type type="text/x-tex">
+		<alias type="application/x-tex" />
+	</mime-type>
+
+	<mime-type type="text/x-texinfo">
+		<alias type="application/x-texinfo" />
+	</mime-type>
+
+	<mime-type type="text/x-troff-me">
+		<alias type="application/x-troff-me" />
+	</mime-type>
+
+	<mime-type type="video/vnd.mpegurl">
+		<glob pattern="*.mxu" />
+	</mime-type>
+
+	<mime-type type="x-conference/x-cooltalk">
+		<_comment>Cooltalk Audio</_comment>
+		<glob pattern="*.ice" />
+	</mime-type>
+
+</mime-info>

Added: lucene/nutch/trunk/lib/tika-0.1-dev.jar
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/tika-0.1-dev.jar?rev=583016&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/nutch/trunk/lib/tika-0.1-dev.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java?rev=583016&r1=583015&r2=583016&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java Mon Oct  8 17:23:38 2007
@@ -36,8 +36,10 @@
 import org.apache.nutch.plugin.PluginRuntimeException;
 import org.apache.nutch.plugin.PluginRepository;
 import org.apache.nutch.util.LogUtil;
-import org.apache.nutch.util.mime.MimeType;
-import org.apache.nutch.util.mime.MimeTypeException;
+
+// Tika imports
+import org.apache.tika.mime.MimeType;
+import org.apache.tika.mime.MimeTypeException;
 
 
 /** Creates and caches {@link Parser} plugins.*/

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java?rev=583016&r1=583015&r2=583016&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java Mon Oct  8 17:23:38 2007
@@ -17,6 +17,7 @@
 
 package org.apache.nutch.protocol;
 
+//JDK imports
 import java.io.ByteArrayInputStream;
 import java.io.DataInput;
 import java.io.DataInputStream;
@@ -25,6 +26,7 @@
 import java.util.Arrays;
 import java.util.zip.InflaterInputStream;
 
+//Hadoop imports
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
@@ -33,11 +35,16 @@
 import org.apache.hadoop.io.UTF8;
 import org.apache.hadoop.io.VersionMismatchException;
 import org.apache.hadoop.io.Writable;
+
+//Nutch imports
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.util.NutchConfiguration;
-import org.apache.nutch.util.mime.MimeType;
-import org.apache.nutch.util.mime.MimeTypeException;
-import org.apache.nutch.util.mime.MimeTypes;
+
+//Tika imports
+import org.apache.tika.mime.MimeType;
+import org.apache.tika.mime.MimeTypeException;
+import org.apache.tika.mime.MimeUtils;
+
 
 public final class Content implements Writable{
 
@@ -59,7 +66,7 @@
 
   private boolean mimeTypeMagic;
 
-  private MimeTypes mimeTypes;
+  private static MimeUtils mimeTypes;
 
   public Content() {
     metadata = new Metadata();
@@ -82,7 +89,9 @@
     this.content = content;
     this.metadata = metadata;
     this.mimeTypeMagic = conf.getBoolean("mime.type.magic", true);
-    this.mimeTypes = MimeTypes.get(conf.get("mime.types.file"));
+    if(this.mimeTypes == null){
+      this.mimeTypes = new MimeUtils(conf.get("mime.types.file"), this.mimeTypeMagic);
+    }
     this.contentType = getContentType(contentType, url, content);
   }
 
@@ -281,28 +290,40 @@
 
   private String getContentType(String typeName, String url, byte[] data) {
     MimeType type = null;
+    String cleanedMimeType = null;
+
     try {
-      typeName = MimeType.clean(typeName);
-      type = typeName == null ? null : this.mimeTypes.forName(typeName);
+      cleanedMimeType = MimeType.clean(typeName);
     } catch (MimeTypeException mte) {
       // Seems to be a malformed mime type name...
     }
 
-    if (typeName == null || type == null || !type.matches(url)) {
+    // first try to get the type from the cleaned type name
+    type = cleanedMimeType != null ? this.mimeTypes.getRepository().forName(
+        cleanedMimeType) : null;
+
+    // if returned null, then try url resolution
+    if (type == null) {
       // If no mime-type header, or cannot find a corresponding registered
-      // mime-type, or the one found doesn't match the url pattern
-      // it shouldbe, then guess a mime-type from the url pattern
-      type = this.mimeTypes.getMimeType(url);
-      typeName = type == null ? typeName : type.getName();
+      // mime-type, then guess a mime-type from the url pattern
+      type = this.mimeTypes.getRepository().getMimeType(url) != null ? this.mimeTypes
+          .getRepository().getMimeType(url)
+          : type;
     }
-    if (typeName == null || type == null
-        || (this.mimeTypeMagic && type.hasMagic() && !type.matches(data))) {
-      // If no mime-type already found, or the one found doesn't match
-      // the magic bytes it should be, then, guess a mime-type from the
-      // document content (magic bytes)
-      type = this.mimeTypes.getMimeType(data);
-      typeName = type == null ? typeName : type.getName();
+
+    // If magic is enabled, guess the mime type from the content (magic bytes).
+    // If that guess differs from the mime type resolved so far, prefer the
+    // type returned by the magic detection.
+    if (this.mimeTypeMagic) {
+      MimeType magicType = this.mimeTypes.getRepository().getMimeType(data);
+      if (magicType != null && !type.getName().equals(magicType.getName())) {
+        // Magic is enabled and the type resolved so far differs from the one
+        // detected by magic, so take the magic mime type.
+
+        type = magicType;
+      }
     }
-    return typeName;
+
+    return type.getName();
   }
 }

Modified: lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=583016&r1=583015&r2=583016&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java Mon Oct  8 17:23:38 2007
@@ -23,6 +23,9 @@
 import org.apache.oro.text.regex.PatternMatcher;
 import org.apache.oro.text.regex.MatchResult;
 import org.apache.oro.text.regex.MalformedPatternException;
+import org.apache.tika.mime.MimeType;
+import org.apache.tika.mime.MimeTypeException;
+import org.apache.tika.mime.MimeUtils;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -44,10 +47,6 @@
 import org.apache.nutch.crawl.Inlinks;
 import org.apache.nutch.parse.ParseData;
 
-import org.apache.nutch.util.mime.MimeType;
-import org.apache.nutch.util.mime.MimeTypes;
-import org.apache.nutch.util.mime.MimeTypeException;
-
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.io.Text;
 
@@ -80,7 +79,7 @@
   private boolean MAGIC;
 
   /** Get the MimeTypes resolver instance. */
-  private MimeTypes MIME; 
+  private static MimeUtils MIME; 
   
   public Document filter(Document doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
     throws IndexingException {
@@ -194,7 +193,7 @@
         // } else {
         //   contentType = MIME.getMimeType(url);
         // }
-        mimeType = MIME.getMimeType(url);
+        mimeType = MIME.getRepository().getMimeType(url);
     } else {
         try {
             mimeType = new MimeType(contentType);
@@ -281,7 +280,8 @@
   public void setConf(Configuration conf) {
     this.conf = conf;
     MAGIC = conf.getBoolean("mime.type.magic", true);
-    MIME = MimeTypes.get(getConf().get("mime.types.file"));
+    if(MIME == null)
+      MIME = new MimeUtils(getConf().get("mime.types.file"), MAGIC);
   }
 
   public Configuration getConf() {

Modified: lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java?rev=583016&r1=583015&r2=583016&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java Mon Oct  8 17:23:38 2007
@@ -42,7 +42,10 @@
 import org.apache.nutch.parse.ParseException;
 import org.apache.nutch.parse.Outlink;
 import org.apache.nutch.protocol.Content;
-import org.apache.nutch.util.mime.MimeTypes;
+
+// Tika imports
+import org.apache.tika.mime.MimeUtils;
+
 
 
 /**
@@ -52,17 +55,18 @@
 public class ZipTextExtractor {
   
   /** Get the MimeTypes resolver instance. */
-  private MimeTypes MIME;
+  private static MimeUtils MIME;
   
   public static final Log LOG = LogFactory.getLog(ZipTextExtractor.class);
 
-private Configuration conf;
+  private Configuration conf;
   
   
   /** Creates a new instance of ZipTextExtractor */
   public ZipTextExtractor(Configuration conf) {
       this.conf = conf;
-      this.MIME = MimeTypes.get(conf.get("mime.types.file"));
+      if(this.MIME == null)
+        this.MIME = new MimeUtils(conf.get("mime.types.file"),conf.getBoolean("mime.type.magic", true));
   }
   
   public String extractText(InputStream input, String url, List outLinksList) throws IOException {
@@ -92,7 +96,7 @@
         int i = fname.lastIndexOf('.');
         if (i != -1) {
           // Trying to resolve the Mime-Type
-          String contentType = MIME.getMimeType(fname).getName();
+          String contentType = MIME.getRepository().getMimeType(fname).getName();
           try {
             Metadata metadata = new Metadata();
             metadata.set(Response.CONTENT_LENGTH, Long.toString(entry.getSize()));

Modified: lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java?rev=583016&r1=583015&r2=583016&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java (original)
+++ lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java Mon Oct  8 17:23:38 2007
@@ -26,12 +26,14 @@
 // Nutch imports
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.protocol.Content;
-import org.apache.nutch.util.mime.MimeType;
-import org.apache.nutch.util.mime.MimeTypes;
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.net.protocols.HttpDateFormat;
 import org.apache.nutch.net.protocols.Response;
 
+// Tika imports
+import org.apache.tika.mime.MimeType;
+import org.apache.tika.mime.MimeUtils;
+
 // Hadoop imports
 import org.apache.hadoop.conf.Configuration;
 
@@ -73,6 +75,8 @@
 
   private final File file;
   private Configuration conf;
+  
+  private static MimeUtils MIME;
 
   /** Returns the response code. */
   public int getCode() { return code; }
@@ -97,6 +101,9 @@
     this.base = url.toString();
     this.file = file;
     this.conf = conf;
+    
+    if(MIME == null)
+      MIME = new MimeUtils(conf.get("mime.types.file"),conf.getBoolean("mime.type.magic", true));
 
     if (!"file".equals(url.getProtocol()))
       throw new FileException("Not a file url:" + url);
@@ -202,8 +209,8 @@
     headers.set(Response.CONTENT_LENGTH, new Long(size).toString());
     headers.set(Response.LAST_MODIFIED, HttpDateFormat.toString(f
         .lastModified()));
-    MimeTypes mimeTypes = MimeTypes.get(conf.get("mime.types.file"));
-    MimeType mimeType = mimeTypes.getMimeType(f);
+    
+    MimeType mimeType = MIME.getRepository().getMimeType(f);
     String mimeTypeString = mimeType != null ? mimeType.getName() : "";
     headers.set(Response.CONTENT_TYPE, mimeTypeString);
 

Modified: lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java?rev=583016&r1=583015&r2=583016&view=diff
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java (original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java Mon Oct  8 17:23:38 2007
@@ -22,6 +22,7 @@
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.WritableTestUtils;
+import org.apache.tika.mime.MimeTypes;
 
 import junit.framework.TestCase;
 
@@ -98,13 +99,13 @@
                     "http://www.foo.com/",
                     "".getBytes("UTF8"),
                     "", p, conf);
-    assertEquals("", c.getContentType());
+    assertEquals(MimeTypes.DEFAULT, c.getContentType());
 
     c = new Content("http://www.foo.com/",
                     "http://www.foo.com/",
                     "".getBytes("UTF8"),
                     null, p, conf);
-    assertNull(c.getContentType());
+    assertNotNull(c.getContentType());
   }
 
 }