You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/03/02 15:21:08 UTC
[1/2] tika git commit: TIKA-1657 move xmlification of TikaConfig to
tika-core. Thank you, Nick!
Repository: tika
Updated Branches:
refs/heads/master 9056894da -> 5a3410715
TIKA-1657 move xmlification of TikaConfig to tika-core. Thank you, Nick!
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/3aa1dca4
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/3aa1dca4
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/3aa1dca4
Branch: refs/heads/master
Commit: 3aa1dca4eef13de99b83989010fe02bfd391b378
Parents: 9056894
Author: tballison <ta...@mitre.org>
Authored: Wed Mar 2 09:18:46 2016 -0500
Committer: tballison <ta...@mitre.org>
Committed: Wed Mar 2 09:18:46 2016 -0500
----------------------------------------------------------------------
.../main/java/org/apache/tika/cli/TikaCLI.java | 26 +-
.../java/org/apache/tika/cli/TikaCLITest.java | 50 +++-
.../test/resources/test-data/tika-config2.xml | 14 +
.../tika/config/TikaConfigSerializer.java | 256 +++++++++++++++++++
.../tika/config/TikaConfigSerializerTest.java | 60 +++++
.../tika/example/DumpTikaConfigExample.java | 233 +----------------
.../tika/example/DumpTikaConfigExampleTest.java | 6 +-
7 files changed, 413 insertions(+), 232 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/3aa1dca4/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
----------------------------------------------------------------------
diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
index 50f3463..4458526 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
@@ -73,6 +73,7 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.tika.Tika;
import org.apache.tika.batch.BatchProcessDriverCLI;
import org.apache.tika.config.TikaConfig;
+import org.apache.tika.config.TikaConfigSerializer;
import org.apache.tika.detect.CompositeDetector;
import org.apache.tika.detect.DefaultDetector;
import org.apache.tika.detect.Detector;
@@ -326,6 +327,8 @@ public class TikaCLI {
private Parser parser;
+ private TikaConfig config;
+
private String configFilePath;
private OutputType type = XML;
@@ -405,6 +408,15 @@ public class TikaCLI {
} else if (arg.startsWith("--compare-file-magic=")) {
pipeMode = false;
compareFileMagic(arg.substring(arg.indexOf('=')+1));
+ } else if (arg.equals("--dump-minimal-config")) {
+ pipeMode = false;
+ dumpConfig(TikaConfigSerializer.Mode.MINIMAL);
+ } else if (arg.equals("--dump-current-config")) {
+ pipeMode = false;
+ dumpConfig(TikaConfigSerializer.Mode.CURRENT);
+ } else if (arg.equals("--dump-static-config")) {
+ pipeMode = false;
+ dumpConfig(TikaConfigSerializer.Mode.STATIC);
} else if (arg.equals("--container-aware")
|| arg.equals("--container-aware-detector")) {
// ignore, as container-aware detectors are now always used
@@ -497,6 +509,13 @@ public class TikaCLI {
}
}
+ private void dumpConfig(TikaConfigSerializer.Mode mode) throws Exception {
+ TikaConfig localConfig = (config == null) ? TikaConfig.getDefaultConfig() : config;
+
+ TikaConfigSerializer.serialize(localConfig, mode,
+ new OutputStreamWriter(System.out, UTF_8), UTF_8);
+ }
+
private void handleRecursiveJson(URL url, OutputStream output) throws IOException, SAXException, TikaException {
Metadata metadata = new Metadata();
RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser, getContentHandlerFactory(type));
@@ -541,7 +560,10 @@ public class TikaCLI {
out.println(" -f or --fork Use Fork Mode for out-of-process extraction");
out.println();
out.println(" --config=<tika-config.xml>");
- out.println(" TikaConfig file. Must be specified before -g, -s or -f!");
+ out.println(" TikaConfig file. Must be specified before -g, -s, -f or the dump-x-config !");
+ out.println(" --dump-minimal-config Print minimal TikaConfig");
+ out.println(" --dump-current-config Print current TikaConfig");
+ out.println(" --dump-static-config Print static config");
out.println("");
out.println(" -x or --xml Output XHTML content (default)");
out.println(" -h or --html Output HTML content");
@@ -673,7 +695,7 @@ public class TikaCLI {
private void configure(String configFilePath) throws Exception {
this.configFilePath = configFilePath;
- TikaConfig config = new TikaConfig(new File(configFilePath));
+ config = new TikaConfig(new File(configFilePath));
parser = new AutoDetectParser(config);
if (digester != null) {
parser = new DigestingParser(parser, digester);
http://git-wip-us.apache.org/repos/asf/tika/blob/3aa1dca4/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
----------------------------------------------------------------------
diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
index f9d5a5d..9fc8ee8 100644
--- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
+++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
@@ -377,7 +377,6 @@ public class TikaCLITest {
" \"Character-Count-With-Spaces\": \"31\","));
assertTrue(content.contains("\"X-TIKA:embedded_resource_path\": \"/embed1.zip\""));
assertFalse(content.contains("X-TIKA:content"));
-
}
@Test
@@ -406,4 +405,53 @@ public class TikaCLITest {
assertTrue(content.contains("\"X-TIKA:digest:MD5\": \"f9627095ef86c482e61d99f0cc1cf87d\""));
}
+ @Test
+ public void testConfigSerializationStaticAndCurrent() throws Exception {
+ String[] params = new String[]{"--dump-static-config"};
+ TikaCLI.main(params);
+ String content = outContent.toString(UTF_8.name());
+ //make sure at least one detector is there
+ assertTrue(content.contains("<detector class=\"org.apache.tika.parser.microsoft.POIFSContainerDetector\"/>"));
+ //make sure ExecutableParser is there because the follow-on tests of a custom config
+ //check that it has been turned off.
+ assertTrue(content.contains("<parser class=\"org.apache.tika.parser.executable.ExecutableParser\"/>"));
+
+ params = new String[]{"--dump-current-config"};
+ TikaCLI.main(params);
+ content = outContent.toString(UTF_8.name());
+ //make sure at least one detector is there
+ assertTrue(content.contains("<detector class=\"org.apache.tika.parser.microsoft.POIFSContainerDetector\"/>"));
+ //and at least one parser
+ assertTrue(content.contains("<parser class=\"org.apache.tika.parser.executable.ExecutableParser\"/>"));
+ }
+
+ @Test
+ public void testConfigSerializationCustomMinimal() throws Exception {
+ String[] params = new String[]{
+ "--config=" + testDataFile.toString() + "/tika-config2.xml",
+ "--dump-minimal-config"};
+ TikaCLI.main(params);
+ String content = outContent.toString(UTF_8.name()).replaceAll("[\r\n\t ]+", " ");
+
+ String expected =
+ "<parser class=\"org.apache.tika.parser.DefaultParser\">" +
+ " <mime-exclude>application/pdf</mime-exclude>" +
+ " <mime-exclude>image/jpeg</mime-exclude> " +
+ "</parser> " +
+ "<parser class=\"org.apache.tika.parser.EmptyParser\">" +
+ " <mime>application/pdf</mime> " +
+ "</parser>";
+ assertTrue(content.contains(expected));
+ }
+
+ @Test
+ public void testConfigSerializationCustomStatic() throws Exception {
+ String[] params = new String[]{
+ "--config=" + testDataFile.toString() + "/tika-config2.xml", "--dump-static-config"};
+ TikaCLI.main(params);
+ String content = outContent.toString(UTF_8.name());
+ assertFalse(content.contains("org.apache.tika.parser.executable.Executable"));
+ }
+
+
}
http://git-wip-us.apache.org/repos/asf/tika/blob/3aa1dca4/tika-app/src/test/resources/test-data/tika-config2.xml
----------------------------------------------------------------------
diff --git a/tika-app/src/test/resources/test-data/tika-config2.xml b/tika-app/src/test/resources/test-data/tika-config2.xml
new file mode 100644
index 0000000..3a511ed
--- /dev/null
+++ b/tika-app/src/test/resources/test-data/tika-config2.xml
@@ -0,0 +1,14 @@
+<properties>
+ <parsers>
+ <parser class="org.apache.tika.parser.DefaultParser">
+ <mime-exclude>image/jpeg</mime-exclude>
+ <mime-exclude>application/pdf</mime-exclude>
+ <parser-exclude class="org.apache.tika.parser.executable.ExecutableParser"/>
+ <parser-exclu class="org.apache.tika.parser.executable.ExecutableParser2"/>
+ </parser>
+ <parser class="org.apache.tika.parser.EmptyParser">
+ <mime>application/pdf</mime>
+ <no-mime>hello/world</no-mime>
+ </parser>
+ </parsers>
+</properties>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/3aa1dca4/tika-core/src/main/java/org/apache/tika/config/TikaConfigSerializer.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/config/TikaConfigSerializer.java b/tika-core/src/main/java/org/apache/tika/config/TikaConfigSerializer.java
new file mode 100644
index 0000000..3c19cfd
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/config/TikaConfigSerializer.java
@@ -0,0 +1,256 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.config;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.transform.OutputKeys;
+import javax.xml.transform.Transformer;
+import javax.xml.transform.TransformerFactory;
+import javax.xml.transform.dom.DOMSource;
+import javax.xml.transform.stream.StreamResult;
+import java.io.Writer;
+import java.nio.charset.Charset;
+import java.util.Collections;
+import java.util.List;
+import java.util.Set;
+import java.util.TreeSet;
+
+import org.apache.tika.detect.CompositeDetector;
+import org.apache.tika.detect.DefaultDetector;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.language.translate.DefaultTranslator;
+import org.apache.tika.language.translate.Translator;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.CompositeParser;
+import org.apache.tika.parser.DefaultParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.ParserDecorator;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.Node;
+
+public class TikaConfigSerializer {
+
+ public enum Mode {
+ MINIMAL, CURRENT, STATIC;
+ }
+
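+ /**
+ * Serializes the given TikaConfig as an indented XML document to the writer.
+ * @param config config to serialize
+ * @param mode serialization mode
+ * @param writer writer that receives the XML
+ * @param charset charset set as the output encoding of the XML transformer
+ * @throws Exception if the XML document cannot be built or written
+ */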
+ public static void serialize(TikaConfig config, Mode mode, Writer writer, Charset charset)
+ throws Exception {
+ DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance();
+ DocumentBuilder docBuilder = docFactory.newDocumentBuilder();
+
+ // root elements
+ Document doc = docBuilder.newDocument();
+ Element rootElement = doc.createElement("properties");
+
+ doc.appendChild(rootElement);
+ addMimeComment(mode, rootElement, doc);
+ addServiceLoader(mode, rootElement, doc, config);
+ addExecutorService(mode, rootElement, doc, config);
+ addTranslator(mode, rootElement, doc, config);
+ addDetectors(mode, rootElement, doc, config);
+ addParsers(mode, rootElement, doc, config);
+ // TODO Service Loader section
+
+ // now write
+ TransformerFactory transformerFactory = TransformerFactory.newInstance();
+ Transformer transformer = transformerFactory.newTransformer();
+ transformer.setOutputProperty(OutputKeys.INDENT, "yes");
+ transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "2");
+ transformer.setOutputProperty(OutputKeys.ENCODING, charset.name());
+ DOMSource source = new DOMSource(doc);
+ StreamResult result = new StreamResult(writer);
+
+ transformer.transform(source, result);
+ }
+
+ private static void addExecutorService(Mode mode, Element rootElement, Document doc, TikaConfig config) {
+ //TODO
+ }
+
+ private static void addServiceLoader(Mode mode, Element rootElement, Document doc, TikaConfig config) {
+ ServiceLoader loader = config.getServiceLoader();
+
+ if (mode == Mode.MINIMAL) {
+ // Is this the default?
+ if (loader.isDynamic() && loader.getLoadErrorHandler() == LoadErrorHandler.IGNORE) {
+ // Default config, no need to output anything
+ return;
+ }
+ }
+
+ Element dslEl = doc.createElement("service-loader");
+ dslEl.setAttribute("dynamic", Boolean.toString(loader.isDynamic()));
+ dslEl.setAttribute("loadErrorHandler", loader.getLoadErrorHandler().toString());
+ rootElement.appendChild(dslEl);
+ }
+
+ private static void addTranslator(Mode mode, Element rootElement, Document doc, TikaConfig config) {
+ // Unlike the other entries, TikaConfig only wants one of
+ // these, and no outer <translators> list
+ Translator translator = config.getTranslator();
+ if (mode == Mode.MINIMAL && translator instanceof DefaultTranslator) {
+ Node mimeComment = doc.createComment(
+ "for example: <translator class=\"org.apache.tika.language.translate.GoogleTranslator\"/>");
+ rootElement.appendChild(mimeComment);
+ } else {
+ if (translator instanceof DefaultTranslator && mode == Mode.STATIC) {
+ translator = ((DefaultTranslator)translator).getTranslator();
+ }
+ if (translator != null) {
+ Element translatorElement = doc.createElement("translator");
+ translatorElement.setAttribute("class", translator.getClass().getCanonicalName());
+ rootElement.appendChild(translatorElement);
+ } else {
+ rootElement.appendChild(doc.createComment("No translators available"));
+ }
+ }
+ }
+
+ private static void addMimeComment(Mode mode, Element rootElement, Document doc) {
+ Node mimeComment = doc.createComment(
+ "for example: <mimeTypeRepository resource=\"/org/apache/tika/mime/tika-mimetypes.xml\"/>");
+ rootElement.appendChild(mimeComment);
+ }
+
+ private static void addDetectors(Mode mode, Element rootElement, Document doc, TikaConfig config) throws Exception {
+ Detector detector = config.getDetector();
+
+ if (mode == Mode.MINIMAL && detector instanceof DefaultDetector) {
+ // All defaults: emit only an example comment, no <detectors> element
+ Node detComment = doc.createComment(
+ "for example: <detectors><detector class=\"org.apache.tika.detector.MimeTypes\"></detectors>");
+ rootElement.appendChild(detComment);
+ return;
+ }
+
+ Element detectorsElement = doc.createElement("detectors");
+ if (mode == Mode.CURRENT && detector instanceof DefaultDetector ||
+ ! (detector instanceof CompositeDetector)) {
+ Element detectorElement = doc.createElement("detector");
+ detectorElement.setAttribute("class", detector.getClass().getCanonicalName());
+ detectorsElement.appendChild(detectorElement);
+ } else {
+ List<Detector> children = ((CompositeDetector)detector).getDetectors();
+ for (Detector d : children) {
+ Element detectorElement = doc.createElement("detector");
+ detectorElement.setAttribute("class", d.getClass().getCanonicalName());
+ detectorsElement.appendChild(detectorElement);
+ }
+ }
+ rootElement.appendChild(detectorsElement);
+ }
+
+ private static void addParsers(Mode mode, Element rootElement, Document doc, TikaConfig config) throws Exception {
+ Parser parser = config.getParser();
+ if (mode == Mode.MINIMAL && parser instanceof DefaultParser) {
+ // Don't output anything, all using defaults
+ return;
+ } else if (mode == Mode.MINIMAL) {
+ mode = Mode.CURRENT;
+ }
+
+ Element parsersElement = doc.createElement("parsers");
+ rootElement.appendChild(parsersElement);
+
+ addParser(mode, parsersElement, doc, parser);
+ }
+
+ private static void addParser(Mode mode, Element rootElement, Document doc, Parser parser) throws Exception {
+ // If the parser is decorated, is it a kind where we output the parser inside?
+ ParserDecorator decoration = null;
+ if (parser instanceof ParserDecorator) {
+ if (parser.getClass().getName().startsWith(ParserDecorator.class.getName()+"$")) {
+ decoration = ((ParserDecorator)parser);
+ parser = decoration.getWrappedParser();
+ }
+ }
+
+ boolean outputParser = true;
+ List<Parser> children = Collections.emptyList();
+ if (mode == Mode.CURRENT && parser instanceof DefaultParser) {
+ // Only output the parser, not the children
+ } else if (parser instanceof CompositeParser) {
+ children = ((CompositeParser)parser).getAllComponentParsers();
+ // Special case for a naked composite
+ if (parser.getClass().equals(CompositeParser.class)) {
+ outputParser = false;
+ }
+ // Special case for making Default to static
+ if (mode == Mode.STATIC && parser instanceof DefaultParser) {
+ outputParser = false;
+ }
+ }
+
+ if (outputParser) {
+ rootElement = addParser(rootElement, doc, parser, decoration);
+ }
+ for (Parser childParser : children) {
+ addParser(mode, rootElement, doc, childParser);
+ }
+ // TODO Parser Exclusions
+ }
+
+ private static Element addParser(Element rootElement, Document doc, Parser parser, ParserDecorator decorator) throws Exception {
+ ParseContext context = new ParseContext();
+
+ Set<MediaType> addedTypes = new TreeSet<>();
+ Set<MediaType> excludedTypes = new TreeSet<>();
+ if (decorator != null) {
+ Set<MediaType> types = new TreeSet<>();
+ types.addAll(decorator.getSupportedTypes(context));
+ addedTypes.addAll(types);
+
+ for (MediaType type : parser.getSupportedTypes(context)) {
+ if (! types.contains(type)) {
+ excludedTypes.add(type);
+ }
+ addedTypes.remove(type);
+ }
+ }
+
+ String className = parser.getClass().getCanonicalName();
+ Element parserElement = doc.createElement("parser");
+ parserElement.setAttribute("class", className);
+ rootElement.appendChild(parserElement);
+
+ for (MediaType type : addedTypes) {
+ Element mimeElement = doc.createElement("mime");
+ mimeElement.appendChild(doc.createTextNode(type.toString()));
+ parserElement.appendChild(mimeElement);
+ }
+ for (MediaType type : excludedTypes) {
+ Element mimeElement = doc.createElement("mime-exclude");
+ mimeElement.appendChild(doc.createTextNode(type.toString()));
+ parserElement.appendChild(mimeElement);
+ }
+
+ return parserElement;
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/3aa1dca4/tika-core/src/test/java/org/apache/tika/config/TikaConfigSerializerTest.java
----------------------------------------------------------------------
diff --git a/tika-core/src/test/java/org/apache/tika/config/TikaConfigSerializerTest.java b/tika-core/src/test/java/org/apache/tika/config/TikaConfigSerializerTest.java
new file mode 100644
index 0000000..01a30eb
--- /dev/null
+++ b/tika-core/src/test/java/org/apache/tika/config/TikaConfigSerializerTest.java
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.config;
+
+
+import java.io.StringWriter;
+import java.nio.charset.StandardCharsets;
+
+import org.junit.Ignore;
+import org.junit.Test;
+
+public class TikaConfigSerializerTest extends TikaConfigTest {
+
+ /**
+ * TIKA-1445 It should be possible to exclude DefaultParser from
+ * certain types, so another parser explicitly listed will take them
+ */
+ @Test
+ public void defaultParserWithExcludes() throws Exception {
+ String xml = loadAndSerialize("TIKA-1445-default-except.xml",
+ TikaConfigSerializer.Mode.STATIC);
+ assertContains(
+ "<parser class=\"org.apache.tika.parser.ErrorParser\">" +
+ " <mime>fail/world</mime> " +
+ "</parser>", xml);
+ }
+
+ @Test
+ @Ignore("TODO: executor-service info needs to be stored in TikaConfig for serialization")
+ public void testExecutors() throws Exception {
+ String xml = loadAndSerialize("TIKA-1762-executors.xml",
+ TikaConfigSerializer.Mode.STATIC);
+ assertContains("<executor-service class=\"org.apache.tika.config.DummyExecutor\">" +
+ " <core-threads>3</core-threads>" +
+ " <max-threads>10</max-threads>" +
+ "</executor-service>", xml);
+ }
+
+ String loadAndSerialize(String configFile, TikaConfigSerializer.Mode mode) throws Exception {
+ TikaConfig config = getConfig(configFile);
+ StringWriter writer = new StringWriter();
+ TikaConfigSerializer.serialize(config, mode, writer, StandardCharsets.UTF_8);
+ return writer.toString().replaceAll("[\r\n\t ]+", " ");
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/3aa1dca4/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java
----------------------------------------------------------------------
diff --git a/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java b/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java
index 0c51634..b312032 100644
--- a/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java
+++ b/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java
@@ -24,36 +24,9 @@ import java.io.OutputStreamWriter;
import java.io.StringWriter;
import java.io.Writer;
import java.nio.charset.Charset;
-import java.util.Collections;
-import java.util.List;
-import java.util.Set;
-import java.util.TreeSet;
-import javax.xml.parsers.DocumentBuilder;
-import javax.xml.parsers.DocumentBuilderFactory;
-import javax.xml.transform.OutputKeys;
-import javax.xml.transform.Transformer;
-import javax.xml.transform.TransformerFactory;
-import javax.xml.transform.dom.DOMSource;
-import javax.xml.transform.stream.StreamResult;
-
-import org.apache.tika.config.LoadErrorHandler;
-import org.apache.tika.config.ServiceLoader;
import org.apache.tika.config.TikaConfig;
-import org.apache.tika.detect.CompositeDetector;
-import org.apache.tika.detect.DefaultDetector;
-import org.apache.tika.detect.Detector;
-import org.apache.tika.language.translate.DefaultTranslator;
-import org.apache.tika.language.translate.Translator;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.CompositeParser;
-import org.apache.tika.parser.DefaultParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.parser.ParserDecorator;
-import org.w3c.dom.Document;
-import org.w3c.dom.Element;
-import org.w3c.dom.Node;
+import org.apache.tika.config.TikaConfigSerializer;
/**
@@ -67,214 +40,24 @@ import org.w3c.dom.Node;
* for your custom mime types.
*/
public class DumpTikaConfigExample {
- /**
- * @param config config file to dump
- * @param writer writer to which to write
- * @throws Exception
- */
- public void dump(TikaConfig config, Mode mode, Writer writer, String encoding) throws Exception {
- DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance();
- DocumentBuilder docBuilder = docFactory.newDocumentBuilder();
-
- // root elements
- Document doc = docBuilder.newDocument();
- Element rootElement = doc.createElement("properties");
-
- doc.appendChild(rootElement);
- addMimeComment(mode, rootElement, doc);
- addServiceLoader(mode, rootElement, doc, config);
- addTranslator(mode, rootElement, doc, config);
- addDetectors(mode, rootElement, doc, config);
- addParsers(mode, rootElement, doc, config);
- // TODO Service Loader section
- // now write
- TransformerFactory transformerFactory = TransformerFactory.newInstance();
- Transformer transformer = transformerFactory.newTransformer();
- transformer.setOutputProperty(OutputKeys.INDENT, "yes");
- transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "2");
- transformer.setOutputProperty(OutputKeys.ENCODING, encoding);
- DOMSource source = new DOMSource(doc);
- StreamResult result = new StreamResult(writer);
-
- transformer.transform(source, result);
- }
-
- private void addServiceLoader(Mode mode, Element rootElement, Document doc, TikaConfig config) {
- ServiceLoader loader = config.getServiceLoader();
-
- if (mode == Mode.MINIMAL) {
- // Is this the default?
- if (loader.isDynamic() && loader.getLoadErrorHandler() == LoadErrorHandler.IGNORE) {
- // Default config, no need to output anything
- return;
- }
- }
-
- Element dslEl = doc.createElement("service-loader");
- dslEl.setAttribute("dynamic", Boolean.toString(loader.isDynamic()));
- dslEl.setAttribute("loadErrorHandler", loader.getLoadErrorHandler().toString());
- rootElement.appendChild(dslEl);
- }
-
- private void addTranslator(Mode mode, Element rootElement, Document doc, TikaConfig config) {
- // Unlike the other entries, TikaConfig only wants one of
- // these, and no outer <translators> list
- Translator translator = config.getTranslator();
- if (mode == Mode.MINIMAL && translator instanceof DefaultTranslator) {
- Node mimeComment = doc.createComment(
- "for example: <translator class=\"org.apache.tika.language.translate.GoogleTranslator\"/>");
- rootElement.appendChild(mimeComment);
- } else {
- if (translator instanceof DefaultTranslator && mode == Mode.STATIC) {
- translator = ((DefaultTranslator)translator).getTranslator();
- }
- if (translator != null) {
- Element translatorElement = doc.createElement("translator");
- translatorElement.setAttribute("class", translator.getClass().getCanonicalName());
- rootElement.appendChild(translatorElement);
- } else {
- rootElement.appendChild(doc.createComment("No translators available"));
- }
- }
- }
-
- private void addMimeComment(Mode mode, Element rootElement, Document doc) {
- Node mimeComment = doc.createComment(
- "for example: <mimeTypeRepository resource=\"/org/apache/tika/mime/tika-mimetypes.xml\"/>");
- rootElement.appendChild(mimeComment);
- }
-
- private void addDetectors(Mode mode, Element rootElement, Document doc, TikaConfig config) throws Exception {
- Detector detector = config.getDetector();
-
- if (mode == Mode.MINIMAL && detector instanceof DefaultDetector) {
- // Don't output anything, all using defaults
- Node detComment = doc.createComment(
- "for example: <detectors><detector class=\"org.apache.tika.detector.MimeTypes\"></detectors>");
- rootElement.appendChild(detComment);
- return;
- }
-
- Element detectorsElement = doc.createElement("detectors");
- if (mode == Mode.CURRENT && detector instanceof DefaultDetector ||
- ! (detector instanceof CompositeDetector)) {
- Element detectorElement = doc.createElement("detector");
- detectorElement.setAttribute("class", detector.getClass().getCanonicalName());
- detectorsElement.appendChild(detectorElement);
- } else {
- List<Detector> children = ((CompositeDetector)detector).getDetectors();
- for (Detector d : children) {
- Element detectorElement = doc.createElement("detector");
- detectorElement.setAttribute("class", d.getClass().getCanonicalName());
- detectorsElement.appendChild(detectorElement);
- }
- }
- rootElement.appendChild(detectorsElement);
- }
-
- private void addParsers(Mode mode, Element rootElement, Document doc, TikaConfig config) throws Exception {
- Parser parser = config.getParser();
- if (mode == Mode.MINIMAL && parser instanceof DefaultParser) {
- // Don't output anything, all using defaults
- return;
- } else if (mode == Mode.MINIMAL) {
- mode = Mode.CURRENT;
- }
-
- Element parsersElement = doc.createElement("parsers");
- rootElement.appendChild(parsersElement);
-
- addParser(mode, parsersElement, doc, parser);
- }
- private void addParser(Mode mode, Element rootElement, Document doc, Parser parser) throws Exception {
- // If the parser is decorated, is it a kind where we output the parser inside?
- ParserDecorator decoration = null;
- if (parser instanceof ParserDecorator) {
- if (parser.getClass().getName().startsWith(ParserDecorator.class.getName()+"$")) {
- decoration = ((ParserDecorator)parser);
- parser = decoration.getWrappedParser();
- }
- }
-
- boolean outputParser = true;
- List<Parser> children = Collections.emptyList();
- if (mode == Mode.CURRENT && parser instanceof DefaultParser) {
- // Only output the parser, not the children
- } else if (parser instanceof CompositeParser) {
- children = ((CompositeParser)parser).getAllComponentParsers();
- // Special case for a naked composite
- if (parser.getClass().equals(CompositeParser.class)) {
- outputParser = false;
- }
- // Special case for making Default to static
- if (mode == Mode.STATIC && parser instanceof DefaultParser) {
- outputParser = false;
- }
- }
-
- if (outputParser) {
- rootElement = addParser(rootElement, doc, parser, decoration);
- }
- for (Parser childParser : children) {
- addParser(mode, rootElement, doc, childParser);
- }
- // TODO Parser Exclusions
- }
- private Element addParser(Element rootElement, Document doc, Parser parser, ParserDecorator decorator) throws Exception {
- ParseContext context = new ParseContext();
-
- Set<MediaType> addedTypes = new TreeSet<>();
- Set<MediaType> excludedTypes = new TreeSet<>();
- if (decorator != null) {
- Set<MediaType> types = new TreeSet<>();
- types.addAll(decorator.getSupportedTypes(context));
- addedTypes.addAll(types);
-
- for (MediaType type : parser.getSupportedTypes(context)) {
- if (! types.contains(type)) {
- excludedTypes.add(type);
- }
- addedTypes.remove(type);
- }
- }
-
- String className = parser.getClass().getCanonicalName();
- Element parserElement = doc.createElement("parser");
- parserElement.setAttribute("class", className);
- rootElement.appendChild(parserElement);
-
- for (MediaType type : addedTypes) {
- Element mimeElement = doc.createElement("mime");
- mimeElement.appendChild(doc.createTextNode(type.toString()));
- parserElement.appendChild(mimeElement);
- }
- for (MediaType type : excludedTypes) {
- Element mimeElement = doc.createElement("mime-exclude");
- mimeElement.appendChild(doc.createTextNode(type.toString()));
- parserElement.appendChild(mimeElement);
- }
-
- return parserElement;
- }
-
/**
* @param args outputFile, outputEncoding, if args is empty, this prints to console
* @throws Exception
*/
public static void main(String[] args) throws Exception {
Charset encoding = UTF_8;
- Mode mode = Mode.CURRENT;
+ TikaConfigSerializer.Mode mode = TikaConfigSerializer.Mode.CURRENT;
String filename = null;
for (String arg : args) {
if (arg.startsWith("-")) {
if (arg.contains("-dump-minimal")) {
- mode = Mode.MINIMAL;
+ mode = TikaConfigSerializer.Mode.MINIMAL;
} else if (arg.contains("-dump-current")) {
- mode = Mode.CURRENT;
+ mode = TikaConfigSerializer.Mode.CURRENT;
} else if (arg.contains("-dump-static")) {
- mode = Mode.STATIC;
+ mode = TikaConfigSerializer.Mode.STATIC;
} else {
System.out.println("Use:");
System.out.println(" DumpTikaConfig [--dump-minimal] [--dump-current] [--dump-static] [filename] [encoding]");
@@ -299,7 +82,7 @@ public class DumpTikaConfigExample {
}
DumpTikaConfigExample ex = new DumpTikaConfigExample();
- ex.dump(TikaConfig.getDefaultConfig(), mode, writer, encoding.name());
+ TikaConfigSerializer.serialize(TikaConfig.getDefaultConfig(), mode, writer, encoding);
writer.flush();
@@ -308,7 +91,5 @@ public class DumpTikaConfigExample {
}
writer.close();
}
- protected enum Mode {
- MINIMAL, CURRENT, STATIC;
- }
+
}
http://git-wip-us.apache.org/repos/asf/tika/blob/3aa1dca4/tika-example/src/test/java/org/apache/tika/example/DumpTikaConfigExampleTest.java
----------------------------------------------------------------------
diff --git a/tika-example/src/test/java/org/apache/tika/example/DumpTikaConfigExampleTest.java b/tika-example/src/test/java/org/apache/tika/example/DumpTikaConfigExampleTest.java
index 29acfab..3f40600 100644
--- a/tika-example/src/test/java/org/apache/tika/example/DumpTikaConfigExampleTest.java
+++ b/tika-example/src/test/java/org/apache/tika/example/DumpTikaConfigExampleTest.java
@@ -30,8 +30,8 @@ import java.io.Writer;
import java.nio.charset.Charset;
import org.apache.tika.config.TikaConfig;
+import org.apache.tika.config.TikaConfigSerializer;
import org.apache.tika.detect.CompositeDetector;
-import org.apache.tika.example.DumpTikaConfigExample.Mode;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.Parser;
@@ -64,9 +64,9 @@ public class DumpTikaConfigExampleTest {
public void testDump() throws Exception {
DumpTikaConfigExample ex = new DumpTikaConfigExample();
for (Charset charset : new Charset[]{UTF_8, UTF_16LE}) {
- for (Mode mode : Mode.values()) {
+ for (TikaConfigSerializer.Mode mode : TikaConfigSerializer.Mode.values()) {
Writer writer = new OutputStreamWriter(new FileOutputStream(configFile), charset);
- ex.dump(TikaConfig.getDefaultConfig(), mode, writer, charset.name());
+ TikaConfigSerializer.serialize(TikaConfig.getDefaultConfig(), mode, writer, charset);
writer.flush();
writer.close();
[2/2] tika git commit: TIKA-1657 move xmlification of TikaConfig to
tika-core. Thank you, Nick!
Posted by ta...@apache.org.
TIKA-1657 move xmlification of TikaConfig to tika-core. Thank you, Nick!
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/5a341071
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/5a341071
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/5a341071
Branch: refs/heads/master
Commit: 5a341071532ac950efeaad222afe3e4a33bb9bee
Parents: 3aa1dca
Author: tballison <ta...@mitre.org>
Authored: Wed Mar 2 09:20:02 2016 -0500
Committer: tballison <ta...@mitre.org>
Committed: Wed Mar 2 09:20:02 2016 -0500
----------------------------------------------------------------------
CHANGES.txt | 3 +++
1 file changed, 3 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/5a341071/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index e6603fa..08ac1dc 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,8 @@
Release 1.13 - ???
+ * Move serialization of TikaConfig to tika-core and enable dumping
+ of the config file via tika-app (TIKA-1657).
+
* Tika now incorporates the Natural Language Toolkit (NLTK) from the
Python community as an option for Named Entity Recognition (TIKA-1876).