You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2011/11/21 13:55:50 UTC

svn commit: r1204476 - in /tika/trunk: tika-core/src/main/java/org/apache/tika/detect/DefaultDetector.java tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java

Author: nick
Date: Mon Nov 21 12:55:49 2011
New Revision: 1204476

URL: http://svn.apache.org/viewvc?rev=1204476&view=rev
Log:
TIKA-786 Control the ordering of detectors in DefaultDetector, so that user supplied detectors come first, then Tika ones, and finally MimeTypes. This ensures that more specific detectors get to try first

Modified:
    tika/trunk/tika-core/src/main/java/org/apache/tika/detect/DefaultDetector.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/detect/DefaultDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/DefaultDetector.java?rev=1204476&r1=1204475&r2=1204476&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/detect/DefaultDetector.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/detect/DefaultDetector.java Mon Nov 21 12:55:49 2011
@@ -17,6 +17,8 @@
 package org.apache.tika.detect;
 
 import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
 import java.util.List;
 
 import javax.imageio.spi.ServiceRegistry;
@@ -27,6 +29,12 @@ import org.apache.tika.mime.MimeTypes;
 /**
  * A composite detector based on all the {@link Detector} implementations
  * available through the {@link ServiceRegistry service provider mechanism}.
+ * 
+ * Detectors are loaded and returned in a fixed order: user supplied detectors
+ *  first, then the other Tika detectors, and finally the Tika MimeTypes class.
+ * If you need to control the order of the Detectors, you should instead
+ *  construct your own {@link CompositeDetector} and pass in the list
+ *  of Detectors in the required order.
  *
  * @since Apache Tika 0.9
  */
@@ -37,9 +45,35 @@ public class DefaultDetector extends Com
 
     private static List<Detector> getDefaultDetectors(
             MimeTypes types, ServiceLoader loader) {
-        List<Detector> detectors = new ArrayList<Detector>();
+        // Find all the detectors available as services
+        List<Detector> svcDetectors = loader.loadServiceProviders(Detector.class);
+        List<Detector> detectors = new ArrayList<Detector>(svcDetectors.size()+1);
+        
+        // Sort the list by classname, rather than discovery order 
+        Collections.sort(svcDetectors, new Comparator<Detector>() {
+            public int compare(Detector d1, Detector d2) {
+               return d1.getClass().getName().compareTo(
+                     d2.getClass().getName());
+            }
+        });
+        
+        // Add the non-Tika (user supplied) detectors first
+        for (Detector d : svcDetectors) {
+           if (! d.getClass().getName().startsWith("org.apache.tika")) {
+              detectors.add(d);
+           }
+        }
+        
+        // Add the Tika detectors next
+        for (Detector d : svcDetectors) {
+           if (d.getClass().getName().startsWith("org.apache.tika")) {
+              detectors.add(d);
+           }
+        }
+        
+        // Finally add the Tika MimeTypes as a fallback
         detectors.add(types);
-        detectors.addAll(loader.loadServiceProviders(Detector.class));
+        
         return detectors;
     }
 

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java?rev=1204476&r1=1204475&r2=1204476&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java Mon Nov 21 12:55:49 2011
@@ -77,10 +77,9 @@ public class TestContainerAwareDetector 
         assertTypeByNameAndData("testPPT.ppt", "application/vnd.ms-powerpoint");
         
         // With the wrong filename supplied, data will trump filename
-        // TODO Fix this! (TIKA-786)
-//        assertTypeByNameAndData("testEXCEL.xls", "notWord.doc",  "application/vnd.ms-excel");
-//        assertTypeByNameAndData("testWORD.doc",  "notExcel.xls", "application/msword");
-//        assertTypeByNameAndData("testPPT.ppt",   "notWord.doc",  "application/vnd.ms-powerpoint");
+        assertTypeByNameAndData("testEXCEL.xls", "notWord.doc",  "application/vnd.ms-excel");
+        assertTypeByNameAndData("testWORD.doc",  "notExcel.xls", "application/msword");
+        assertTypeByNameAndData("testPPT.ppt",   "notWord.doc",  "application/vnd.ms-powerpoint");
         
         // With a filename of a totally different type, data will trump filename
         assertTypeByNameAndData("testEXCEL.xls", "notPDF.pdf",  "application/vnd.ms-excel");
@@ -127,10 +126,9 @@ public class TestContainerAwareDetector 
         assertTypeByNameAndData("testPPT.pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation");
         
         // With the wrong filename supplied, data will trump filename
-        // TODO Fix this! (TIKA-786)
-//        assertTypeByNameAndData("testEXCEL.xlsx", "notWord.docx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
-//        assertTypeByNameAndData("testWORD.docx",  "notExcel.xlsx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document");
-//        assertTypeByNameAndData("testPPT.pptx",   "notWord.docx", "application/vnd.openxmlformats-officedocument.presentationml.presentation");
+        assertTypeByNameAndData("testEXCEL.xlsx", "notWord.docx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
+        assertTypeByNameAndData("testWORD.docx",  "notExcel.xlsx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document");
+        assertTypeByNameAndData("testPPT.pptx",   "notWord.docx", "application/vnd.openxmlformats-officedocument.presentationml.presentation");
         
         // With an incorrect filename of a different container type, data trumps filename
         assertTypeByNameAndData("testEXCEL.xlsx", "notOldExcel.xls", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");