You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2015/08/01 16:53:36 UTC
svn commit: r1693713 - in /tika/trunk/tika-core/src:
main/java/org/apache/tika/config/TikaConfig.java
test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTestWithTika.java
Author: nick
Date: Sat Aug 1 14:53:36 2015
New Revision: 1693713
URL: http://svn.apache.org/r1693713
Log:
TIKA-1702 Refactor some of the config parser loading to be more reusable for detectors, and bring the method signatures in line with respect to Composite vs. not (must always be Composite)
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
tika/trunk/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTestWithTika.java
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java?rev=1693713&r1=1693712&r2=1693713&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java Sat Aug 1 14:53:36 2015
@@ -66,7 +66,7 @@ public class TikaConfig {
return MimeTypes.getDefaultMimeTypes(loader);
}
- protected Detector getDefaultDetector(
+ protected CompositeDetector getDefaultDetector(
MimeTypes types, ServiceLoader loader) {
return new DefaultDetector(types, loader);
}
@@ -80,7 +80,7 @@ public class TikaConfig {
return new DefaultTranslator(loader);
}
private final CompositeParser parser;
- private final Detector detector;
+ private final CompositeDetector detector;
private final Translator translator;
private final MimeTypes mimeTypes;
@@ -317,6 +317,33 @@ public class TikaConfig {
}
return null;
}
+ private static List<Element> getTopLevelElementChildren(Element element,
+ String parentName, String childrenName) throws TikaException {
+ // Should be only zero or one <parsers> / <detectors> etc tag
+ NodeList nodes = element.getElementsByTagName(parentName);
+ if (nodes.getLength() > 1) {
+ throw new TikaException("Properties may not contain multiple "+parentName+" entries");
+ }
+ else if (nodes.getLength() == 1) {
+ // Find only the direct child parser/detector objects
+ Node parsersE = nodes.item(0);
+ nodes = parsersE.getChildNodes();
+ List<Element> elements = new ArrayList<Element>();
+ for (int i = 0; i < nodes.getLength(); i++) {
+ Node node = nodes.item(i);
+ if (node instanceof Element) {
+ Element nodeE = (Element)node;
+ if (childrenName.equals(nodeE.getTagName())) {
+ elements.add(nodeE);
+ }
+ }
+ }
+ return elements;
+ } else {
+ // No elements of this type
+ return Collections.emptyList();
+ }
+ }
private static MimeTypes typesFromDomElement(Element element)
throws TikaException, IOException {
@@ -333,24 +360,9 @@ public class TikaConfig {
throws TikaException, IOException {
List<Parser> parsers = new ArrayList<Parser>();
- // Should be only zero or one <parsers> tag
- NodeList nodes = element.getElementsByTagName("parsers");
- if (nodes.getLength() > 1) {
- throw new TikaException("Properties may not contain multiple Parsers entries");
- }
- else if (nodes.getLength() == 1) {
- // Find only the direct child parser objects
- Node parsersE = nodes.item(0);
- nodes = parsersE.getChildNodes();
- for (int i = 0; i < nodes.getLength(); i++) {
- Node node = nodes.item(i);
- if (node instanceof Element) {
- Element nodeE = (Element)node;
- if ("parser".equals(nodeE.getTagName())) {
- parsers.add(parserFromParserDomElement(nodeE, mimeTypes, loader));
- }
- }
- }
+ // Find the parser children of the parsers tag, if any
+ for (Element pe : getTopLevelElementChildren(element, "parsers", "parser")) {
+ parsers.add(parserFromParserDomElement(pe, mimeTypes, loader));
}
if (parsers.isEmpty()) {
@@ -500,7 +512,7 @@ public class TikaConfig {
return Collections.emptySet();
}
- private static Detector detectorFromDomElement(
+ private static CompositeDetector detectorFromDomElement(
Element element, MimeTypes mimeTypes, ServiceLoader loader)
throws TikaException, IOException {
List<Detector> detectors = new ArrayList<Detector>();
Modified: tika/trunk/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTestWithTika.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTestWithTika.java?rev=1693713&r1=1693712&r2=1693713&view=diff
==============================================================================
--- tika/trunk/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTestWithTika.java (original)
+++ tika/trunk/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTestWithTika.java Sat Aug 1 14:53:36 2015
@@ -29,14 +29,10 @@ import java.nio.charset.Charset;
import org.apache.tika.Tika;
import org.apache.tika.config.ServiceLoader;
import org.apache.tika.config.TikaConfig;
+import org.apache.tika.detect.CompositeDetector;
import org.apache.tika.detect.DefaultProbDetector;
-import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.mime.MediaTypeRegistry;
-import org.apache.tika.mime.MimeTypes;
-import org.apache.tika.mime.ProbabilisticMimeDetectionSelector;
import org.apache.tika.mime.ProbabilisticMimeDetectionSelector.Builder;
import org.junit.Before;
import org.junit.Test;
@@ -54,7 +50,7 @@ public class ProbabilisticMimeDetectionT
registry = MimeTypes.getDefaultMimeTypes().getMediaTypeRegistry();
tika = new Tika(new TikaConfig() {
@Override
- protected Detector getDefaultDetector(MimeTypes types,
+ protected CompositeDetector getDefaultDetector(MimeTypes types,
ServiceLoader loader) {
/*
* here is an example with the use of the builder to