You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2015/02/28 14:37:33 UTC
svn commit: r1662940 - in /tika/trunk:
tika-core/src/main/java/org/apache/tika/config/
tika-core/src/main/java/org/apache/tika/parser/
tika-parsers/src/test/java/org/apache/tika/config/
Author: nick
Date: Sat Feb 28 13:37:33 2015
New Revision: 1662940
URL: http://svn.apache.org/r1662940
Log:
TIKA-1558 Support excluding (blacklisting) parsers via the config, so that DefaultParser can be used for everything except certain named parsers. Also support declaring the child parsers of a composite parser in the config, as a step towards TIKA-1509.
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/DefaultParser.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/config/TikaParserConfigTest.java
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java?rev=1662940&r1=1662939&r2=1662940&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java Sat Feb 28 13:37:33 2015
@@ -20,8 +20,11 @@ import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
+import java.lang.reflect.Constructor;
+import java.lang.reflect.InvocationTargetException;
import java.net.URL;
import java.util.ArrayList;
+import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
@@ -332,60 +335,112 @@ public class TikaConfig {
NodeList nodes = element.getElementsByTagName("parser");
for (int i = 0; i < nodes.getLength(); i++) {
Element node = (Element) nodes.item(i);
- String name = node.getAttribute("class");
- Parser parser = null;
+ parsers.add(parserFromParserDomElement(node, mimeTypes, loader));
+ }
+
+ if (parsers.isEmpty()) {
+ // No parsers defined, create a DefaultParser
+ return getDefaultParser(mimeTypes, loader);
+ } else if (parsers.size() == 1 && parsers.get(0) instanceof CompositeParser) {
+ // Single Composite defined, use that
+ return (CompositeParser)parsers.get(0);
+ } else {
+ // Wrap the defined parsers up in a Composite
+ MediaTypeRegistry registry = mimeTypes.getMediaTypeRegistry();
+ return new CompositeParser(registry, parsers);
+ }
+ }
+ private static Parser parserFromParserDomElement(
+ Element parserNode, MimeTypes mimeTypes, ServiceLoader loader)
+ throws TikaException, IOException {
+ String name = parserNode.getAttribute("class");
+ Parser parser = null;
- try {
- Class<? extends Parser> parserClass =
- loader.getServiceClass(Parser.class, name);
- // https://issues.apache.org/jira/browse/TIKA-866
- if (AutoDetectParser.class.isAssignableFrom(parserClass)) {
- throw new TikaException(
- "AutoDetectParser not supported in a <parser>"
- + " configuration element: " + name);
- }
+ try {
+ Class<? extends Parser> parserClass =
+ loader.getServiceClass(Parser.class, name);
+ // https://issues.apache.org/jira/browse/TIKA-866
+ if (AutoDetectParser.class.isAssignableFrom(parserClass)) {
+ throw new TikaException(
+ "AutoDetectParser not supported in a <parser>"
+ + " configuration element: " + name);
+ }
- // Is this a composite parser? If so, support recursion
- if (CompositeParser.class.isAssignableFrom(parserClass)) {
- // TODO Implement
- System.err.println("WARNING: Not building " + parserClass + " as composite!");
- parser = parserClass.newInstance();
- } else {
- // Regular parser, create as-is
- parser = parserClass.newInstance();
+ // Is this a composite parser? If so, support recursion
+ if (CompositeParser.class.isAssignableFrom(parserClass)) {
+ // Get the child parsers for it
+ List<Parser> childParsers = new ArrayList<Parser>();
+ NodeList childParserNodes = parserNode.getElementsByTagName("parser");
+ if (childParserNodes.getLength() > 0) {
+ for (int i = 0; i < childParserNodes.getLength(); i++) {
+ childParsers.add(parserFromParserDomElement(
+ (Element)childParserNodes.item(i), mimeTypes, loader
+ ));
+ }
}
-
- // Is there an explicit list of mime types for this to handle?
- Set<MediaType> parserTypes = mediaTypesListFromDomElement(node, "mime");
- if (! parserTypes.isEmpty()) {
- parser = ParserDecorator.withTypes(parser, parserTypes);
+
+ // Get the list of parsers to exclude
+ Set<Class<? extends Parser>> excludeParsers = new HashSet<Class<? extends Parser>>();
+ NodeList excludeParserNodes = parserNode.getElementsByTagName("parser-exclude");
+ if (excludeParserNodes.getLength() > 0) {
+ for (int i = 0; i < excludeParserNodes.getLength(); i++) {
+ Element excl = (Element)excludeParserNodes.item(i);
+ String exclName = excl.getAttribute("class");
+ excludeParsers.add(loader.getServiceClass(Parser.class, exclName));
+ }
+ }
+
+ // Create the Composite Parser
+ Constructor<? extends Parser> c = null;
+ if (c == null) {
+ try {
+ c = parserClass.getConstructor(MediaTypeRegistry.class, ServiceLoader.class, Collection.class);
+ parser = c.newInstance(mimeTypes.getMediaTypeRegistry(), loader, excludeParsers);
+ }
+ catch (NoSuchMethodException me) {}
}
- // Is there an explicit list of mime types this shouldn't handle?
- Set<MediaType> parserExclTypes = mediaTypesListFromDomElement(node, "mime-exclude");
- if (! parserExclTypes.isEmpty()) {
- parser = ParserDecorator.withoutTypes(parser, parserExclTypes);
+ if (c == null) {
+ try {
+ c = parserClass.getConstructor(MediaTypeRegistry.class, List.class, Collection.class);
+ parser = c.newInstance(mimeTypes.getMediaTypeRegistry(), childParsers, excludeParsers);
+ } catch (NoSuchMethodException me) {}
+ }
+ if (c == null) {
+ parser = parserClass.newInstance();
}
+ } else {
+ // Regular parser, create as-is
+ parser = parserClass.newInstance();
+ }
- // All done with setup
- parsers.add(parser);
- } catch (ClassNotFoundException e) {
- throw new TikaException(
- "Unable to find a parser class: " + name, e);
- } catch (IllegalAccessException e) {
- throw new TikaException(
- "Unable to access a parser class: " + name, e);
- } catch (InstantiationException e) {
- throw new TikaException(
- "Unable to instantiate a parser class: " + name, e);
+ // Is there an explicit list of mime types for this to handle?
+ Set<MediaType> parserTypes = mediaTypesListFromDomElement(parserNode, "mime");
+ if (! parserTypes.isEmpty()) {
+ parser = ParserDecorator.withTypes(parser, parserTypes);
}
- }
- if (parsers.isEmpty()) {
- return getDefaultParser(mimeTypes, loader);
- } else {
- MediaTypeRegistry registry = mimeTypes.getMediaTypeRegistry();
- return new CompositeParser(registry, parsers);
+ // Is there an explicit list of mime types this shouldn't handle?
+ Set<MediaType> parserExclTypes = mediaTypesListFromDomElement(parserNode, "mime-exclude");
+ if (! parserExclTypes.isEmpty()) {
+ parser = ParserDecorator.withoutTypes(parser, parserExclTypes);
+ }
+
+ // All done with setup
+ return parser;
+ } catch (ClassNotFoundException e) {
+ throw new TikaException(
+ "Unable to find a parser class: " + name, e);
+ } catch (IllegalAccessException e) {
+ throw new TikaException(
+ "Unable to access a parser class: " + name, e);
+ } catch (InvocationTargetException e) {
+ throw new TikaException(
+ "Unable to create a parser class: " + name, e);
+ } catch (InstantiationException e) {
+ throw new TikaException(
+ "Unable to instantiate a parser class: " + name, e);
}
}
+
private static Set<MediaType> mediaTypesListFromDomElement(
Element node, String tag)
throws TikaException, IOException {
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java?rev=1662940&r1=1662939&r2=1662940&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java Sat Feb 28 13:37:33 2015
@@ -30,6 +30,7 @@ import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
+import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
@@ -62,10 +63,23 @@ public class CompositeParser extends Abs
*/
private Parser fallback = new EmptyParser();
- public CompositeParser(MediaTypeRegistry registry, List<Parser> parsers) {
- this.parsers = parsers;
+ public CompositeParser(MediaTypeRegistry registry, List<Parser> parsers,
+ Collection<Class<? extends Parser>> excludeParsers) {
+ if (excludeParsers == null || excludeParsers.isEmpty()) {
+ this.parsers = parsers;
+ } else {
+ this.parsers = new ArrayList<Parser>();
+ for (Parser p : parsers) {
+ if (! excludeParsers.contains(p.getClass())) {
+ this.parsers.add(p);
+ }
+ }
+ }
this.registry = registry;
}
+ public CompositeParser(MediaTypeRegistry registry, List<Parser> parsers) {
+ this(registry, parsers, null);
+ }
public CompositeParser(MediaTypeRegistry registry, Parser... parsers) {
this(registry, Arrays.asList(parsers));
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/DefaultParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/DefaultParser.java?rev=1662940&r1=1662939&r2=1662940&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/DefaultParser.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/DefaultParser.java Sat Feb 28 13:37:33 2015
@@ -16,6 +16,7 @@
*/
package org.apache.tika.parser;
+import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
@@ -69,10 +70,15 @@ public class DefaultParser extends Compo
private transient final ServiceLoader loader;
- public DefaultParser(MediaTypeRegistry registry, ServiceLoader loader) {
- super(registry, getDefaultParsers(loader));
+ public DefaultParser(MediaTypeRegistry registry, ServiceLoader loader,
+ Collection<Class<? extends Parser>> excludeParsers) {
+ super(registry, getDefaultParsers(loader), excludeParsers);
this.loader = loader;
}
+
+ public DefaultParser(MediaTypeRegistry registry, ServiceLoader loader) {
+ this(registry, loader, null);
+ }
public DefaultParser(MediaTypeRegistry registry, ClassLoader loader) {
this(registry, new ServiceLoader(loader));
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/config/TikaParserConfigTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/config/TikaParserConfigTest.java?rev=1662940&r1=1662939&r2=1662940&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/config/TikaParserConfigTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/config/TikaParserConfigTest.java Sat Feb 28 13:37:33 2015
@@ -123,8 +123,6 @@ public class TikaParserConfigTest {
// The one from the config won't
- // TODO - Finish this
-/*
assertNotContained(PE_EXE, confParser.getSupportedTypes(context));
assertNotContained(ELF, confParser.getSupportedTypes(context));
@@ -132,6 +130,5 @@ public class TikaParserConfigTest {
if (p instanceof ExecutableParser)
fail("Shouldn't have the Executable Parser from config");
}
-*/
}
}