You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2015/02/28 14:37:33 UTC

svn commit: r1662940 - in /tika/trunk: tika-core/src/main/java/org/apache/tika/config/ tika-core/src/main/java/org/apache/tika/parser/ tika-parsers/src/test/java/org/apache/tika/config/

Author: nick
Date: Sat Feb 28 13:37:33 2015
New Revision: 1662940

URL: http://svn.apache.org/r1662940
Log:
TIKA-1558 Support excluding (blacklisting) parsers from config, so you can use DefaultParser for everything except certain parsers. Also support defining the child parsers of a composite parser from config, as a step towards TIKA-1509

Modified:
    tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
    tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
    tika/trunk/tika-core/src/main/java/org/apache/tika/parser/DefaultParser.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/config/TikaParserConfigTest.java

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java?rev=1662940&r1=1662939&r2=1662940&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java Sat Feb 28 13:37:33 2015
@@ -20,8 +20,11 @@ import java.io.File;
 import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.lang.reflect.Constructor;
+import java.lang.reflect.InvocationTargetException;
 import java.net.URL;
 import java.util.ArrayList;
+import java.util.Collection;
 import java.util.Collections;
 import java.util.HashSet;
 import java.util.List;
@@ -332,60 +335,112 @@ public class TikaConfig {
         NodeList nodes = element.getElementsByTagName("parser");
         for (int i = 0; i < nodes.getLength(); i++) {
             Element node = (Element) nodes.item(i);
-            String name = node.getAttribute("class");
-            Parser parser = null;
+            parsers.add(parserFromParserDomElement(node, mimeTypes, loader));
+        }
+        
+        if (parsers.isEmpty()) {
+            // No parsers defined, create a DefaultParser
+            return getDefaultParser(mimeTypes, loader);
+        } else if (parsers.size() == 1 && parsers.get(0) instanceof CompositeParser) {
+            // Single Composite defined, use that
+            return (CompositeParser)parsers.get(0);
+        } else {
+            // Wrap the defined parsers up in a Composite
+            MediaTypeRegistry registry = mimeTypes.getMediaTypeRegistry();
+            return new CompositeParser(registry, parsers);
+        }
+    }
+    private static Parser parserFromParserDomElement(
+            Element parserNode, MimeTypes mimeTypes, ServiceLoader loader)
+            throws TikaException, IOException {
+        String name = parserNode.getAttribute("class");
+        Parser parser = null;
 
-            try {
-                Class<? extends Parser> parserClass =
-                        loader.getServiceClass(Parser.class, name);
-                // https://issues.apache.org/jira/browse/TIKA-866
-                if (AutoDetectParser.class.isAssignableFrom(parserClass)) {
-                    throw new TikaException(
-                            "AutoDetectParser not supported in a <parser>"
-                            + " configuration element: " + name);
-                }
+        try {
+            Class<? extends Parser> parserClass =
+                    loader.getServiceClass(Parser.class, name);
+            // https://issues.apache.org/jira/browse/TIKA-866
+            if (AutoDetectParser.class.isAssignableFrom(parserClass)) {
+                throw new TikaException(
+                        "AutoDetectParser not supported in a <parser>"
+                        + " configuration element: " + name);
+            }
 
-                // Is this a composite parser? If so, support recursion
-                if (CompositeParser.class.isAssignableFrom(parserClass)) {
-                    // TODO Implement
-                    System.err.println("WARNING: Not building " + parserClass + " as composite!");
-                    parser = parserClass.newInstance();
-                } else {
-                    // Regular parser, create as-is
-                    parser = parserClass.newInstance();
+            // Is this a composite parser? If so, support recursion
+            if (CompositeParser.class.isAssignableFrom(parserClass)) {
+                // Get the child parsers for it
+                List<Parser> childParsers = new ArrayList<Parser>();
+                NodeList childParserNodes = parserNode.getElementsByTagName("parser");
+                if (childParserNodes.getLength() > 0) {
+                    for (int i = 0; i < childParserNodes.getLength(); i++) {
+                        childParsers.add(parserFromParserDomElement(
+                                (Element)childParserNodes.item(i), mimeTypes, loader
+                        ));
+                    }
                 }
-
-                // Is there an explicit list of mime types for this to handle?
-                Set<MediaType> parserTypes = mediaTypesListFromDomElement(node, "mime");
-                if (! parserTypes.isEmpty()) {
-                    parser = ParserDecorator.withTypes(parser, parserTypes);
+                
+                // Get the list of parsers to exclude
+                Set<Class<? extends Parser>> excludeParsers = new HashSet<Class<? extends Parser>>();
+                NodeList excludeParserNodes = parserNode.getElementsByTagName("parser-exclude");
+                if (excludeParserNodes.getLength() > 0) {
+                    for (int i = 0; i < excludeParserNodes.getLength(); i++) {
+                        Element excl = (Element)excludeParserNodes.item(i);
+                        String exclName = excl.getAttribute("class");
+                        excludeParsers.add(loader.getServiceClass(Parser.class, exclName));
+                    }
+                }
+                
+                // Create the Composite Parser
+                Constructor<? extends Parser> c = null;
+                if (c == null) {
+                    try {
+                        c = parserClass.getConstructor(MediaTypeRegistry.class, ServiceLoader.class, Collection.class);
+                        parser = c.newInstance(mimeTypes.getMediaTypeRegistry(), loader, excludeParsers);
+                    } 
+                    catch (NoSuchMethodException me) {}
                 }
-                // Is there an explicit list of mime types this shouldn't handle?
-                Set<MediaType> parserExclTypes = mediaTypesListFromDomElement(node, "mime-exclude");
-                if (! parserExclTypes.isEmpty()) {
-                    parser = ParserDecorator.withoutTypes(parser, parserExclTypes);
+                if (c == null) {
+                    try {
+                        c = parserClass.getConstructor(MediaTypeRegistry.class, List.class, Collection.class);
+                        parser = c.newInstance(mimeTypes.getMediaTypeRegistry(), childParsers, excludeParsers);
+                    } catch (NoSuchMethodException me) {}
+                }
+                if (c == null) {
+                    parser = parserClass.newInstance();
                 }
+            } else {
+                // Regular parser, create as-is
+                parser = parserClass.newInstance();
+            }
 
-                // All done with setup
-                parsers.add(parser);
-            } catch (ClassNotFoundException e) {
-                throw new TikaException(
-                        "Unable to find a parser class: " + name, e);
-            } catch (IllegalAccessException e) {
-                throw new TikaException(
-                        "Unable to access a parser class: " + name, e);
-            } catch (InstantiationException e) {
-                throw new TikaException(
-                        "Unable to instantiate a parser class: " + name, e);
+            // Is there an explicit list of mime types for this to handle?
+            Set<MediaType> parserTypes = mediaTypesListFromDomElement(parserNode, "mime");
+            if (! parserTypes.isEmpty()) {
+                parser = ParserDecorator.withTypes(parser, parserTypes);
             }
-        }
-        if (parsers.isEmpty()) {
-            return getDefaultParser(mimeTypes, loader);
-        } else {
-            MediaTypeRegistry registry = mimeTypes.getMediaTypeRegistry();
-            return new CompositeParser(registry, parsers);
+            // Is there an explicit list of mime types this shouldn't handle?
+            Set<MediaType> parserExclTypes = mediaTypesListFromDomElement(parserNode, "mime-exclude");
+            if (! parserExclTypes.isEmpty()) {
+                parser = ParserDecorator.withoutTypes(parser, parserExclTypes);
+            }
+            
+            // All done with setup
+            return parser;
+        } catch (ClassNotFoundException e) {
+            throw new TikaException(
+                    "Unable to find a parser class: " + name, e);
+        } catch (IllegalAccessException e) {
+            throw new TikaException(
+                    "Unable to access a parser class: " + name, e);
+        } catch (InvocationTargetException e) {
+            throw new TikaException(
+                    "Unable to create a parser class: " + name, e);
+        } catch (InstantiationException e) {
+            throw new TikaException(
+                    "Unable to instantiate a parser class: " + name, e);
         }
     }
+    
     private static Set<MediaType> mediaTypesListFromDomElement(
             Element node, String tag) 
             throws TikaException, IOException {

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java?rev=1662940&r1=1662939&r2=1662940&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java Sat Feb 28 13:37:33 2015
@@ -30,6 +30,7 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.Collection;
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.List;
@@ -62,10 +63,23 @@ public class CompositeParser extends Abs
      */
     private Parser fallback = new EmptyParser();
 
-    public CompositeParser(MediaTypeRegistry registry, List<Parser> parsers) {
-        this.parsers = parsers;
+    public CompositeParser(MediaTypeRegistry registry, List<Parser> parsers,
+                           Collection<Class<? extends Parser>> excludeParsers) {
+        if (excludeParsers == null || excludeParsers.isEmpty()) {
+            this.parsers = parsers;
+        } else {
+            this.parsers = new ArrayList<Parser>();
+            for (Parser p : parsers) {
+                if (! excludeParsers.contains(p.getClass())) {
+                    this.parsers.add(p);
+                }
+            }
+        }
         this.registry = registry;
     }
+    public CompositeParser(MediaTypeRegistry registry, List<Parser> parsers) {
+        this(registry, parsers, null);
+    }
 
     public CompositeParser(MediaTypeRegistry registry, Parser... parsers) {
         this(registry, Arrays.asList(parsers));

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/DefaultParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/DefaultParser.java?rev=1662940&r1=1662939&r2=1662940&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/DefaultParser.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/DefaultParser.java Sat Feb 28 13:37:33 2015
@@ -16,6 +16,7 @@
  */
 package org.apache.tika.parser;
 
+import java.util.Collection;
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.List;
@@ -69,10 +70,15 @@ public class DefaultParser extends Compo
 
     private transient final ServiceLoader loader;
 
-    public DefaultParser(MediaTypeRegistry registry, ServiceLoader loader) {
-        super(registry, getDefaultParsers(loader));
+    public DefaultParser(MediaTypeRegistry registry, ServiceLoader loader,
+                         Collection<Class<? extends Parser>> excludeParsers) {
+        super(registry, getDefaultParsers(loader), excludeParsers);
         this.loader = loader;
     }
+    
+    public DefaultParser(MediaTypeRegistry registry, ServiceLoader loader) {
+        this(registry, loader, null);
+    }
 
     public DefaultParser(MediaTypeRegistry registry, ClassLoader loader) {
         this(registry, new ServiceLoader(loader));

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/config/TikaParserConfigTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/config/TikaParserConfigTest.java?rev=1662940&r1=1662939&r2=1662940&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/config/TikaParserConfigTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/config/TikaParserConfigTest.java Sat Feb 28 13:37:33 2015
@@ -123,8 +123,6 @@ public class TikaParserConfigTest {
         
         
         // The one from the config won't
-        // TODO - Finish this
-/*
         assertNotContained(PE_EXE, confParser.getSupportedTypes(context));
         assertNotContained(ELF, confParser.getSupportedTypes(context));
         
@@ -132,6 +130,5 @@ public class TikaParserConfigTest {
             if (p instanceof ExecutableParser)
                 fail("Shouldn't have the Executable Parser from config");
         }
-*/
     }
 }