You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2012/06/15 13:27:09 UTC

svn commit: r1350580 - in /nutch/branches/nutchgora: ./ ivy/ src/java/org/apache/nutch/util/ src/plugin/creativecommons/src/test/org/creativecommons/nutch/ src/plugin/index-more/src/java/org/apache/nutch/indexer/more/ src/plugin/parse-tika/src/java/org...

Author: lewismc
Date: Fri Jun 15 11:27:09 2012
New Revision: 1350580

URL: http://svn.apache.org/viewvc?rev=1350580&view=rev
Log:
Upgrade to Tika 1.1 NUTCH-1396

Modified:
    nutch/branches/nutchgora/CHANGES.txt
    nutch/branches/nutchgora/ivy/ivy.xml
    nutch/branches/nutchgora/src/java/org/apache/nutch/util/MimeUtil.java
    nutch/branches/nutchgora/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java
    nutch/branches/nutchgora/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
    nutch/branches/nutchgora/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
    nutch/branches/nutchgora/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java
    nutch/branches/nutchgora/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java
    nutch/branches/nutchgora/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java
    nutch/branches/nutchgora/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRSSParser.java
    nutch/branches/nutchgora/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java

Modified: nutch/branches/nutchgora/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/nutchgora/CHANGES.txt?rev=1350580&r1=1350579&r2=1350580&view=diff
==============================================================================
--- nutch/branches/nutchgora/CHANGES.txt (original)
+++ nutch/branches/nutchgora/CHANGES.txt Fri Jun 15 11:27:09 2012
@@ -3,6 +3,8 @@ Nutch Change Log
 Release 2.0 (08/06/2012) ddmmyyy
 Full Jira report - https://issues.apache.org/jira/secure/ReleaseNote.jspa?projectId=10680&version=12314893
 
+* NUTCH-1396 Upgrade to Tika 1.1 (jnioche)
+
 * NUTCH-1392 -force and -resume arguments being ignored in ParserJob (ferdy via lewismc)
 
 * NUTCH-1379 NPE when reprUrl is null in ParseUtil (ferdy)

Modified: nutch/branches/nutchgora/ivy/ivy.xml
URL: http://svn.apache.org/viewvc/nutch/branches/nutchgora/ivy/ivy.xml?rev=1350580&r1=1350579&r2=1350580&view=diff
==============================================================================
--- nutch/branches/nutchgora/ivy/ivy.xml (original)
+++ nutch/branches/nutchgora/ivy/ivy.xml Fri Jun 15 11:27:09 2012
@@ -55,7 +55,7 @@
     </dependency>
 
     <dependency org="com.ibm.icu" name="icu4j" rev="4.0.1" />
-    <dependency org="org.apache.tika" name="tika-core" rev="0.10" />
+    <dependency org="org.apache.tika" name="tika-core" rev="1.1" />
     <!-- 
     <dependency org="org.apache.tika" name="tika-parsers" rev="0.10"/> 
     -->

Modified: nutch/branches/nutchgora/src/java/org/apache/nutch/util/MimeUtil.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchgora/src/java/org/apache/nutch/util/MimeUtil.java?rev=1350580&r1=1350579&r2=1350580&view=diff
==============================================================================
--- nutch/branches/nutchgora/src/java/org/apache/nutch/util/MimeUtil.java (original)
+++ nutch/branches/nutchgora/src/java/org/apache/nutch/util/MimeUtil.java Fri Jun 15 11:27:09 2012
@@ -24,13 +24,19 @@ import java.io.File;
 import org.apache.hadoop.conf.Configuration;
 
 // Tika imports
+import org.apache.tika.Tika;
 import org.apache.tika.mime.MimeType;
 import org.apache.tika.mime.MimeTypeException;
 import org.apache.tika.mime.MimeTypes;
 import org.apache.tika.mime.MimeTypesFactory;
+
+// Slf4j logging imports
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+// imported for Javadoc
+import org.apache.nutch.protocol.ProtocolOutput;
+
 /**
  * @author mattmann
  * @since NUTCH-608
@@ -49,6 +55,9 @@ public final class MimeUtil {
   /* our Tika mime type registry */
   private MimeTypes mimeTypes;
 
+  /* the tika detectors */
+  private Tika tika;
+
   /* whether or not magic should be employed or not */
   private boolean mimeMagic;
 
@@ -56,6 +65,7 @@ public final class MimeUtil {
   private static final Logger LOG = LoggerFactory.getLogger(MimeUtil.class.getName());
 
   public MimeUtil(Configuration conf) {
+    tika = new Tika();
     ObjectCache objectCache = ObjectCache.get(conf);
     MimeTypes mimeTypez = (MimeTypes) objectCache.getObject(MimeTypes.class
         .getName());
@@ -118,7 +128,7 @@ public final class MimeUtil {
    * <code>typeName</code> is cleaned, with {@link #cleanMimeType(String)}.
    * Then the cleaned mime type is looked up in the underlying Tika
    * {@link MimeTypes} registry, by its cleaned name. If the {@link MimeType} is
-   * found, then that mime type is used, otherwise {@link URL} resolution is
+   * found, then that mime type is used, otherwise URL resolution is
    * used to try and determine the mime type. If that means is unsuccessful, and
    * if <code>mime.type.magic</code> is enabled in {@link NutchConfiguration},
    * then mime type magic resolution is used to try and obtain a
@@ -127,12 +137,14 @@ public final class MimeUtil {
    * @param typeName
    *          The original mime type, returned from a {@link ProtocolOutput}.
    * @param url
-   *          The given {@link URL}, that Nutch was trying to crawl.
+   *          The given URL string that Nutch was trying to crawl.
    * @param data
    *          The byte data, returned from the crawl, if any.
    * @return The correctly, automatically guessed {@link MimeType} name.
    */
   public String autoResolveContentType(String typeName, String url, byte[] data) {
+    String retType = null;
+    String magicType = null;
     MimeType type = null;
     String cleanedMimeType = null;
 
@@ -161,59 +173,65 @@ public final class MimeUtil {
           .getMimeType(url) : type;
     }
 
+    retType= type.getName();
+
     // if magic is enabled use mime magic to guess if the mime type returned
     // from the magic guess is different than the one that's already set so far
     // if it is, and it's not the default mime type, then go with the mime type
     // returned by the magic
     if (this.mimeMagic) {
-      MimeType magicType = this.mimeTypes.getMimeType(data);
-      if (magicType != null && !magicType.getName().equals(MimeTypes.OCTET_STREAM)
-          && !magicType.getName().equals(MimeTypes.PLAIN_TEXT)
-          && type != null && !type.getName().equals(magicType.getName())) {
+      magicType = tika.detect(data);
+
+      // Deprecated in Tika 1.0. See https://issues.apache.org/jira/browse/NUTCH-1230
+      //MimeType magicType = this.mimeTypes.getMimeType(data);
+      if (magicType != null && !magicType.equals(MimeTypes.OCTET_STREAM)
+          && !magicType.equals(MimeTypes.PLAIN_TEXT)
+          && retType != null && !retType.equals(magicType)) {
+
         // If magic enabled and the current mime type differs from that of the
         // one returned from the magic, take the magic mimeType
-        type = magicType;
+        retType = magicType;
       }
 
       // if type is STILL null after all the resolution strategies, go for the
       // default type
-      if (type == null) {
+      if (retType == null) {
         try {
-          type = this.mimeTypes.forName(MimeTypes.OCTET_STREAM);
+          retType = MimeTypes.OCTET_STREAM;
         } catch (Exception ignore) {
         }
       }
     }
 
-    return type.getName();
+    return retType;
   }
 
   /**
    * Facade interface to Tika's underlying {@link MimeTypes#getMimeType(String)}
    * method.
-   * 
+   *
    * @param url
    *          A string representation of the document {@link URL} to sense the
    *          {@link MimeType} for.
    * @return An appropriate {@link MimeType}, identified from the given
    *         Document url in string form.
    */
-  public MimeType getMimeType(String url) {
-    return this.mimeTypes.getMimeType(url);
+  public String getMimeType(String url) {
+    return tika.detect(url);
   }
 
   /**
    * A facade interface to Tika's underlying {@link MimeTypes#forName(String)}
    * method.
-   * 
+   *
    * @param name
    *          The name of a valid {@link MimeType} in the Tika mime registry.
    * @return The object representation of the {@link MimeType}, if it exists,
    *         or null otherwise.
    */
-  public MimeType forName(String name) {
+  public String forName(String name) {
     try {
-      return this.mimeTypes.forName(name);
+      return this.mimeTypes.forName(name).toString();
     } catch (MimeTypeException e) {
       LOG.error("Exception getting mime type by name: [" + name
           + "]: Message: " + e.getMessage());
@@ -224,14 +242,21 @@ public final class MimeUtil {
   /**
    * Facade interface to Tika's underlying {@link MimeTypes#getMimeType(File)}
    * method.
-   * 
+   *
    * @param f
    *          The {@link File} to sense the {@link MimeType} for.
    * @return The {@link MimeType} of the given {@link File}, or null if it
    *         cannot be determined.
    */
-  public MimeType getMimeType(File f) {
-    return this.mimeTypes.getMimeType(f);
+  public String getMimeType(File f) {
+    try {
+      return tika.detect(f);
+    } catch (Exception e) {
+      LOG.error("Exception getting mime type for file: [" + f.getPath()
+          + "]: Message: " + e.getMessage());
+      return null;
+    }
   }
 
+
 }

Modified: nutch/branches/nutchgora/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchgora/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java?rev=1350580&r1=1350579&r2=1350580&view=diff
==============================================================================
--- nutch/branches/nutchgora/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java (original)
+++ nutch/branches/nutchgora/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java Fri Jun 15 11:27:09 2012
@@ -73,8 +73,8 @@ public class TestCCParseFilter extends T
 		page.setBaseUrl(new Utf8(url));
 		page.setContent(ByteBuffer.wrap(bytes));
 		MimeUtil mimeutil = new MimeUtil(conf);
-		MimeType mtype = mimeutil.getMimeType(file);
-		page.setContentType(new Utf8(mtype.getName()));
+		String mtype = mimeutil.getMimeType(file);
+		page.setContentType(new Utf8(mtype));
 
 		new ParseUtil(conf).parse(url, page);
 

Modified: nutch/branches/nutchgora/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchgora/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=1350580&r1=1350579&r2=1350580&view=diff
==============================================================================
--- nutch/branches/nutchgora/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java (original)
+++ nutch/branches/nutchgora/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java Fri Jun 15 11:27:09 2012
@@ -17,16 +17,12 @@
 package org.apache.nutch.indexer.more;
 
 import java.text.ParseException;
-import java.text.SimpleDateFormat;
 import java.util.Collection;
 import java.util.Date;
 import java.util.HashSet;
-import java.util.TimeZone;
 
 import org.apache.avro.util.Utf8;
 import org.apache.commons.lang.time.DateUtils;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.indexer.IndexingException;
 import org.apache.nutch.indexer.IndexingFilter;
@@ -42,8 +38,9 @@ import org.apache.oro.text.regex.Pattern
 import org.apache.oro.text.regex.Perl5Compiler;
 import org.apache.oro.text.regex.Perl5Matcher;
 import org.apache.oro.text.regex.Perl5Pattern;
-import org.apache.tika.mime.MimeType;
 import org.apache.solr.common.util.DateUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 /**
  * Add (or reset) a few metaData properties as respective fields (if they are
@@ -170,7 +167,7 @@ public class MoreIndexingFilter implemen
    * @return
    */
   private NutchDocument addType(NutchDocument doc, WebPage page, String url) {
-    MimeType mimeType = null;
+    String mimeType = null;
     Utf8 contentType = page.getFromHeaders(new Utf8(HttpHeaders.CONTENT_TYPE));
     if (contentType == null) {
       // Note by Jerome Charron on 20050415:
@@ -194,9 +191,9 @@ public class MoreIndexingFilter implemen
       return doc;
     }
 
-    String scontentType = mimeType.getName();
+    //String scontentType = mimeType.getName();
 
-    doc.add("type", scontentType);
+    doc.add("type", mimeType);
 
     // Check if we need to split the content type in sub parts
     if (conf.getBoolean("moreIndexingFilter.indexMimeTypeParts", true)) {

Modified: nutch/branches/nutchgora/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchgora/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java?rev=1350580&r1=1350579&r2=1350580&view=diff
==============================================================================
--- nutch/branches/nutchgora/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java (original)
+++ nutch/branches/nutchgora/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java Fri Jun 15 11:27:09 2012
@@ -242,13 +242,13 @@ public class TikaParser implements org.a
     page.setBaseUrl(new Utf8(url));
     page.setContent(ByteBuffer.wrap(bytes));
     MimeUtil mimeutil = new MimeUtil(conf);
-    MimeType mtype = mimeutil.getMimeType(file);
-    page.setContentType(new Utf8(mtype.getName()));
+    String mtype = mimeutil.getMimeType(file);
+    page.setContentType(new Utf8(mtype));
     // Parse parse = parser.getParse(url, page);
 
     Parse parse = new ParseUtil(conf).parse(url, page);
 
-    System.out.println("content type: " + mtype.getName());
+    System.out.println("content type: " + mtype);
     System.out.println("title: " + parse.getTitle());
     System.out.println("text: " + parse.getText());
     System.out.println("outlinks: " + Arrays.toString(parse.getOutlinks()));

Modified: nutch/branches/nutchgora/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchgora/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java?rev=1350580&r1=1350579&r2=1350580&view=diff
==============================================================================
--- nutch/branches/nutchgora/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java (original)
+++ nutch/branches/nutchgora/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java Fri Jun 15 11:27:09 2012
@@ -82,8 +82,8 @@ public class TestMSWordParser extends Te
 	page.setContent(ByteBuffer.wrap(bytes));
 	// set the content type?
 	MimeUtil mimeutil = new MimeUtil(conf);
-	MimeType mtype = mimeutil.getMimeType(file);
-	page.setContentType(new Utf8(mtype.getName()));
+	String mtype = mimeutil.getMimeType(file);
+	page.setContentType(new Utf8(mtype));
 		
 	parse = new ParseUtil(conf).parse("file:"+urlString, page);
 	return parse.getText();

Modified: nutch/branches/nutchgora/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchgora/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java?rev=1350580&r1=1350579&r2=1350580&view=diff
==============================================================================
--- nutch/branches/nutchgora/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java (original)
+++ nutch/branches/nutchgora/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java Fri Jun 15 11:27:09 2012
@@ -106,8 +106,8 @@ public class TestOOParser extends TestCa
 	    WebPage page = new WebPage();
 	    page.setBaseUrl(new Utf8(urlString));
 	    page.setContent(ByteBuffer.wrap(bytes));
-	    MimeType mtype = mimeutil.getMimeType(file);
-	    page.setContentType(new Utf8(mtype.getName()));
+	    String mtype = mimeutil.getMimeType(file);
+	    page.setContentType(new Utf8(mtype));
 
 	    parse = new ParseUtil(conf).parse(urlString, page);
 

Modified: nutch/branches/nutchgora/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchgora/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java?rev=1350580&r1=1350579&r2=1350580&view=diff
==============================================================================
--- nutch/branches/nutchgora/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java (original)
+++ nutch/branches/nutchgora/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java Fri Jun 15 11:27:09 2012
@@ -81,8 +81,8 @@ public class TestPdfParser extends TestC
 	    WebPage page = new WebPage();
 	    page.setBaseUrl(new Utf8(urlString));
 	    page.setContent(ByteBuffer.wrap(bytes));
-	    MimeType mtype = mimeutil.getMimeType(file);
-	    page.setContentType(new Utf8(mtype.getName()));
+	    String mtype = mimeutil.getMimeType(file);
+	    page.setContentType(new Utf8(mtype));
 
 	    parse = new ParseUtil(conf).parse(urlString, page);
 

Modified: nutch/branches/nutchgora/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRSSParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchgora/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRSSParser.java?rev=1350580&r1=1350579&r2=1350580&view=diff
==============================================================================
--- nutch/branches/nutchgora/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRSSParser.java (original)
+++ nutch/branches/nutchgora/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRSSParser.java Fri Jun 15 11:27:09 2012
@@ -102,8 +102,8 @@ public class TestRSSParser extends TestC
       WebPage page = new WebPage();
       page.setBaseUrl(new Utf8(urlString));
       page.setContent(ByteBuffer.wrap(bytes));
-      MimeType mtype = mimeutil.getMimeType(file);
-      page.setContentType(new Utf8(mtype.getName()));
+      String mtype = mimeutil.getMimeType(file);
+      page.setContentType(new Utf8(mtype));
 
       parse = new ParseUtil(conf).parse(urlString, page);
 

Modified: nutch/branches/nutchgora/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchgora/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java?rev=1350580&r1=1350579&r2=1350580&view=diff
==============================================================================
--- nutch/branches/nutchgora/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java (original)
+++ nutch/branches/nutchgora/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java Fri Jun 15 11:27:09 2012
@@ -209,8 +209,8 @@ public class FileResponse {
     headers.set(Response.LAST_MODIFIED, HttpDateFormat.toString(f
         .lastModified()));
     
-    MimeType mimeType = MIME.getMimeType(f);
-    String mimeTypeString = mimeType != null ? mimeType.getName() : "";
+    String mimeType = MIME.getMimeType(f);
+    String mimeTypeString = mimeType != null ? mimeType.toString() : "";
     headers.set(Response.CONTENT_TYPE, mimeTypeString);
 
     // response code