You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/03/02 15:21:08 UTC

[1/2] tika git commit: TIKA-1657 move xmlification of TikaConfig to tika-core. Thank you, Nick!

Repository: tika
Updated Branches:
  refs/heads/master 9056894da -> 5a3410715


TIKA-1657 move xmlification of TikaConfig to tika-core.  Thank you, Nick!


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/3aa1dca4
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/3aa1dca4
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/3aa1dca4

Branch: refs/heads/master
Commit: 3aa1dca4eef13de99b83989010fe02bfd391b378
Parents: 9056894
Author: tballison <ta...@mitre.org>
Authored: Wed Mar 2 09:18:46 2016 -0500
Committer: tballison <ta...@mitre.org>
Committed: Wed Mar 2 09:18:46 2016 -0500

----------------------------------------------------------------------
 .../main/java/org/apache/tika/cli/TikaCLI.java  |  26 +-
 .../java/org/apache/tika/cli/TikaCLITest.java   |  50 +++-
 .../test/resources/test-data/tika-config2.xml   |  14 +
 .../tika/config/TikaConfigSerializer.java       | 256 +++++++++++++++++++
 .../tika/config/TikaConfigSerializerTest.java   |  60 +++++
 .../tika/example/DumpTikaConfigExample.java     | 233 +----------------
 .../tika/example/DumpTikaConfigExampleTest.java |   6 +-
 7 files changed, 413 insertions(+), 232 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/3aa1dca4/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
----------------------------------------------------------------------
diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
index 50f3463..4458526 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
@@ -73,6 +73,7 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.apache.tika.Tika;
 import org.apache.tika.batch.BatchProcessDriverCLI;
 import org.apache.tika.config.TikaConfig;
+import org.apache.tika.config.TikaConfigSerializer;
 import org.apache.tika.detect.CompositeDetector;
 import org.apache.tika.detect.DefaultDetector;
 import org.apache.tika.detect.Detector;
@@ -326,6 +327,8 @@ public class TikaCLI {
 
     private Parser parser;
 
+    private TikaConfig config;
+
     private String configFilePath;
 
     private OutputType type = XML;
@@ -405,6 +408,15 @@ public class TikaCLI {
         } else if (arg.startsWith("--compare-file-magic=")) {
             pipeMode = false;
             compareFileMagic(arg.substring(arg.indexOf('=')+1));
+        } else if (arg.equals("--dump-minimal-config")) {
+            pipeMode = false;
+            dumpConfig(TikaConfigSerializer.Mode.MINIMAL);
+        } else if (arg.equals("--dump-current-config")) {
+            pipeMode = false;
+            dumpConfig(TikaConfigSerializer.Mode.CURRENT);
+        } else if (arg.equals("--dump-static-config")) {
+            pipeMode = false;
+            dumpConfig(TikaConfigSerializer.Mode.STATIC);
         } else if (arg.equals("--container-aware")
                 || arg.equals("--container-aware-detector")) {
             // ignore, as container-aware detectors are now always used
@@ -497,6 +509,13 @@ public class TikaCLI {
         }
     }
 
+    private void dumpConfig(TikaConfigSerializer.Mode mode) throws Exception {
+        TikaConfig localConfig = (config == null) ? TikaConfig.getDefaultConfig() : config;
+
+        TikaConfigSerializer.serialize(localConfig, mode,
+                new OutputStreamWriter(System.out, UTF_8), UTF_8);
+    }
+
     private void handleRecursiveJson(URL url, OutputStream output) throws IOException, SAXException, TikaException {
         Metadata metadata = new Metadata();
         RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser, getContentHandlerFactory(type));
@@ -541,7 +560,10 @@ public class TikaCLI {
         out.println("    -f  or --fork          Use Fork Mode for out-of-process extraction");
         out.println();
         out.println("    --config=<tika-config.xml>");
-        out.println("        TikaConfig file. Must be specified before -g, -s or -f!");
+        out.println("        TikaConfig file. Must be specified before -g, -s, -f or the dump-x-config !");
+        out.println("    --dump-minimal-config  Print minimal TikaConfig");
+        out.println("    --dump-current-config  Print current TikaConfig");
+        out.println("    --dump-static-config   Print static config");
         out.println("");
         out.println("    -x  or --xml           Output XHTML content (default)");
         out.println("    -h  or --html          Output HTML content");
@@ -673,7 +695,7 @@ public class TikaCLI {
 
     private void configure(String configFilePath) throws Exception {
         this.configFilePath = configFilePath;
-        TikaConfig config = new TikaConfig(new File(configFilePath));
+        config = new TikaConfig(new File(configFilePath));
         parser = new AutoDetectParser(config);
         if (digester != null) {
             parser = new DigestingParser(parser, digester);

http://git-wip-us.apache.org/repos/asf/tika/blob/3aa1dca4/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
----------------------------------------------------------------------
diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
index f9d5a5d..9fc8ee8 100644
--- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
+++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
@@ -377,7 +377,6 @@ public class TikaCLITest {
                 "    \"Character-Count-With-Spaces\": \"31\","));
         assertTrue(content.contains("\"X-TIKA:embedded_resource_path\": \"/embed1.zip\""));
         assertFalse(content.contains("X-TIKA:content"));
-
     }
 
     @Test
@@ -406,4 +405,53 @@ public class TikaCLITest {
         assertTrue(content.contains("\"X-TIKA:digest:MD5\": \"f9627095ef86c482e61d99f0cc1cf87d\""));
     }
 
+    @Test
+    public void testConfigSerializationStaticAndCurrent() throws Exception {
+        String[] params = new String[]{"--dump-static-config"};
+        TikaCLI.main(params);
+        String content = outContent.toString(UTF_8.name());
+        //make sure at least one detector is there
+        assertTrue(content.contains("<detector class=\"org.apache.tika.parser.microsoft.POIFSContainerDetector\"/>"));
+        //make sure Executable is there because follow on tests of custom config
+        //test that it has been turned off.
+        assertTrue(content.contains("<parser class=\"org.apache.tika.parser.executable.ExecutableParser\"/>"));
+
+        params = new String[]{"--dump-current-config"};
+        TikaCLI.main(params);
+        content = outContent.toString(UTF_8.name());
+        //make sure at least one detector is there
+        assertTrue(content.contains("<detector class=\"org.apache.tika.parser.microsoft.POIFSContainerDetector\"/>"));
+        //and at least one parser
+        assertTrue(content.contains("<parser class=\"org.apache.tika.parser.executable.ExecutableParser\"/>"));
+    }
+
+    @Test
+    public void testConfigSerializationCustomMinimal() throws Exception {
+        String[] params = new String[]{
+                "--config=" + testDataFile.toString() + "/tika-config2.xml",
+                "--dump-minimal-config"};
+        TikaCLI.main(params);
+        String content = outContent.toString(UTF_8.name()).replaceAll("[\r\n\t ]+", " ");
+
+        String expected =
+                "<parser class=\"org.apache.tika.parser.DefaultParser\">" +
+                        " <mime-exclude>application/pdf</mime-exclude>" +
+                        " <mime-exclude>image/jpeg</mime-exclude> " +
+                        "</parser> " +
+                        "<parser class=\"org.apache.tika.parser.EmptyParser\">" +
+                        " <mime>application/pdf</mime> " +
+                        "</parser>";
+        assertTrue(content.contains(expected));
+    }
+
+    @Test
+    public void testConfigSerializationCustomStatic() throws Exception {
+        String[] params = new String[]{
+                "--config=" + testDataFile.toString() + "/tika-config2.xml", "--dump-static-config"};
+        TikaCLI.main(params);
+        String content = outContent.toString(UTF_8.name());
+        assertFalse(content.contains("org.apache.tika.parser.executable.Executable"));
+    }
+
+
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/3aa1dca4/tika-app/src/test/resources/test-data/tika-config2.xml
----------------------------------------------------------------------
diff --git a/tika-app/src/test/resources/test-data/tika-config2.xml b/tika-app/src/test/resources/test-data/tika-config2.xml
new file mode 100644
index 0000000..3a511ed
--- /dev/null
+++ b/tika-app/src/test/resources/test-data/tika-config2.xml
@@ -0,0 +1,14 @@
+<properties>
+  <parsers>
+    <parser class="org.apache.tika.parser.DefaultParser">
+      <mime-exclude>image/jpeg</mime-exclude>
+      <mime-exclude>application/pdf</mime-exclude>
+      <parser-exclude class="org.apache.tika.parser.executable.ExecutableParser"/>
+      <parser-exclu class="org.apache.tika.parser.executable.ExecutableParser2"/>
+    </parser>
+    <parser class="org.apache.tika.parser.EmptyParser">
+      <mime>application/pdf</mime>
+      <no-mime>hello/world</no-mime>
+    </parser>
+  </parsers>
+</properties>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/3aa1dca4/tika-core/src/main/java/org/apache/tika/config/TikaConfigSerializer.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/config/TikaConfigSerializer.java b/tika-core/src/main/java/org/apache/tika/config/TikaConfigSerializer.java
new file mode 100644
index 0000000..3c19cfd
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/config/TikaConfigSerializer.java
@@ -0,0 +1,256 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.config;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.transform.OutputKeys;
+import javax.xml.transform.Transformer;
+import javax.xml.transform.TransformerFactory;
+import javax.xml.transform.dom.DOMSource;
+import javax.xml.transform.stream.StreamResult;
+import java.io.Writer;
+import java.nio.charset.Charset;
+import java.util.Collections;
+import java.util.List;
+import java.util.Set;
+import java.util.TreeSet;
+
+import org.apache.tika.detect.CompositeDetector;
+import org.apache.tika.detect.DefaultDetector;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.language.translate.DefaultTranslator;
+import org.apache.tika.language.translate.Translator;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.CompositeParser;
+import org.apache.tika.parser.DefaultParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.ParserDecorator;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.Node;
+
+public class TikaConfigSerializer {
+
+    public enum Mode {
+        MINIMAL, CURRENT, STATIC;
+    }
+
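+    /**
+     * Serializes the given config as an XML document, rooted at a "properties" element, to the writer.
+     * @param config config to serialize
+     * @param mode serialization mode (MINIMAL, CURRENT or STATIC)
+     * @param writer writer to which the XML is written
+     * @param charset charset set as the transformer's output encoding
+     * @throws Exception if the XML document cannot be built or written
+     */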
+    public static void serialize(TikaConfig config, Mode mode, Writer writer, Charset charset)
+            throws Exception {
+        DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance();
+        DocumentBuilder docBuilder = docFactory.newDocumentBuilder();
+
+        // root elements
+        Document doc = docBuilder.newDocument();
+        Element rootElement = doc.createElement("properties");
+
+        doc.appendChild(rootElement);
+        addMimeComment(mode, rootElement, doc);
+        addServiceLoader(mode, rootElement, doc, config);
+        addExecutorService(mode, rootElement, doc, config);
+        addTranslator(mode, rootElement, doc, config);
+        addDetectors(mode, rootElement, doc, config);
+        addParsers(mode, rootElement, doc, config);
+        // TODO Service Loader section
+
+        // now write
+        TransformerFactory transformerFactory = TransformerFactory.newInstance();
+        Transformer transformer = transformerFactory.newTransformer();
+        transformer.setOutputProperty(OutputKeys.INDENT, "yes");
+        transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "2");
+        transformer.setOutputProperty(OutputKeys.ENCODING, charset.name());
+        DOMSource source = new DOMSource(doc);
+        StreamResult result = new StreamResult(writer);
+
+        transformer.transform(source, result);
+    }
+
+    private static void addExecutorService(Mode mode, Element rootElement, Document doc, TikaConfig config) {
+        //TODO
+    }
+
+    private static void addServiceLoader(Mode mode, Element rootElement, Document doc, TikaConfig config) {
+        ServiceLoader loader = config.getServiceLoader();
+
+        if (mode == Mode.MINIMAL) {
+            // Is this the default?
+            if (loader.isDynamic() && loader.getLoadErrorHandler() == LoadErrorHandler.IGNORE) {
+                // Default config, no need to output anything
+                return;
+            }
+        }
+
+        Element dslEl = doc.createElement("service-loader");
+        dslEl.setAttribute("dynamic", Boolean.toString(loader.isDynamic()));
+        dslEl.setAttribute("loadErrorHandler", loader.getLoadErrorHandler().toString());
+        rootElement.appendChild(dslEl);
+    }
+
+    private static void addTranslator(Mode mode, Element rootElement, Document doc, TikaConfig config) {
+        // Unlike the other entries, TikaConfig only wants one of
+        //  these, and no outer <translators> list
+        Translator translator = config.getTranslator();
+        if (mode == Mode.MINIMAL && translator instanceof DefaultTranslator) {
+            Node mimeComment = doc.createComment(
+                    "for example: <translator class=\"org.apache.tika.language.translate.GoogleTranslator\"/>");
+            rootElement.appendChild(mimeComment);
+        } else {
+            if (translator instanceof DefaultTranslator && mode == Mode.STATIC) {
+                translator = ((DefaultTranslator)translator).getTranslator();
+            }
+            if (translator != null) {
+                Element translatorElement = doc.createElement("translator");
+                translatorElement.setAttribute("class", translator.getClass().getCanonicalName());
+                rootElement.appendChild(translatorElement);
+            } else {
+                rootElement.appendChild(doc.createComment("No translators available"));
+            }
+        }
+    }
+
+    private static void addMimeComment(Mode mode, Element rootElement, Document doc) {
+        Node mimeComment = doc.createComment(
+                "for example: <mimeTypeRepository resource=\"/org/apache/tika/mime/tika-mimetypes.xml\"/>");
+        rootElement.appendChild(mimeComment);
+    }
+
+    private static void addDetectors(Mode mode, Element rootElement, Document doc, TikaConfig config) throws Exception {
+        Detector detector = config.getDetector();
+
+        if (mode == Mode.MINIMAL && detector instanceof DefaultDetector) {
+            // All using defaults: output only an example comment, no detectors element
+            Node detComment = doc.createComment(
+                    "for example: <detectors><detector class=\"org.apache.tika.detector.MimeTypes\"></detectors>");
+            rootElement.appendChild(detComment);
+            return;
+        }
+
+        Element detectorsElement = doc.createElement("detectors");
+        if (mode == Mode.CURRENT && detector instanceof DefaultDetector ||
+                ! (detector instanceof CompositeDetector)) {
+            Element detectorElement = doc.createElement("detector");
+            detectorElement.setAttribute("class", detector.getClass().getCanonicalName());
+            detectorsElement.appendChild(detectorElement);
+        } else {
+            List<Detector> children = ((CompositeDetector)detector).getDetectors();
+            for (Detector d : children) {
+                Element detectorElement = doc.createElement("detector");
+                detectorElement.setAttribute("class", d.getClass().getCanonicalName());
+                detectorsElement.appendChild(detectorElement);
+            }
+        }
+        rootElement.appendChild(detectorsElement);
+    }
+
+    private static void addParsers(Mode mode, Element rootElement, Document doc, TikaConfig config) throws Exception {
+        Parser parser = config.getParser();
+        if (mode == Mode.MINIMAL && parser instanceof DefaultParser) {
+            // Don't output anything, all using defaults
+            return;
+        } else if (mode == Mode.MINIMAL) {
+            mode = Mode.CURRENT;
+        }
+
+        Element parsersElement = doc.createElement("parsers");
+        rootElement.appendChild(parsersElement);
+
+        addParser(mode, parsersElement, doc, parser);
+    }
+
+    private static void addParser(Mode mode, Element rootElement, Document doc, Parser parser) throws Exception {
+        // If the parser is decorated, is it a kind where we output the parser inside?
+        ParserDecorator decoration = null;
+        if (parser instanceof ParserDecorator) {
+            if (parser.getClass().getName().startsWith(ParserDecorator.class.getName()+"$")) {
+                decoration = ((ParserDecorator)parser);
+                parser = decoration.getWrappedParser();
+            }
+        }
+
+        boolean outputParser = true;
+        List<Parser> children = Collections.emptyList();
+        if (mode == Mode.CURRENT && parser instanceof DefaultParser) {
+            // Only output the parser, not the children
+        } else if (parser instanceof CompositeParser) {
+            children = ((CompositeParser)parser).getAllComponentParsers();
+            // Special case for a naked composite
+            if (parser.getClass().equals(CompositeParser.class)) {
+                outputParser = false;
+            }
+            // Special case for static mode: skip the DefaultParser element itself, only output its children
+            if (mode == Mode.STATIC && parser instanceof DefaultParser) {
+                outputParser = false;
+            }
+        }
+
+        if (outputParser) {
+            rootElement = addParser(rootElement, doc, parser, decoration);
+        }
+        for (Parser childParser : children) {
+            addParser(mode, rootElement, doc, childParser);
+        }
+        // TODO Parser Exclusions
+    }
+
+    private static Element addParser(Element rootElement, Document doc, Parser parser, ParserDecorator decorator) throws Exception {
+        ParseContext context = new ParseContext();
+
+        Set<MediaType> addedTypes = new TreeSet<>();
+        Set<MediaType> excludedTypes = new TreeSet<>();
+        if (decorator != null) {
+            Set<MediaType> types = new TreeSet<>();
+            types.addAll(decorator.getSupportedTypes(context));
+            addedTypes.addAll(types);
+
+            for (MediaType type : parser.getSupportedTypes(context)) {
+                if (! types.contains(type)) {
+                    excludedTypes.add(type);
+                }
+                addedTypes.remove(type);
+            }
+        }
+
+        String className = parser.getClass().getCanonicalName();
+        Element parserElement = doc.createElement("parser");
+        parserElement.setAttribute("class", className);
+        rootElement.appendChild(parserElement);
+
+        for (MediaType type : addedTypes) {
+            Element mimeElement = doc.createElement("mime");
+            mimeElement.appendChild(doc.createTextNode(type.toString()));
+            parserElement.appendChild(mimeElement);
+        }
+        for (MediaType type : excludedTypes) {
+            Element mimeElement = doc.createElement("mime-exclude");
+            mimeElement.appendChild(doc.createTextNode(type.toString()));
+            parserElement.appendChild(mimeElement);
+        }
+
+        return parserElement;
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/3aa1dca4/tika-core/src/test/java/org/apache/tika/config/TikaConfigSerializerTest.java
----------------------------------------------------------------------
diff --git a/tika-core/src/test/java/org/apache/tika/config/TikaConfigSerializerTest.java b/tika-core/src/test/java/org/apache/tika/config/TikaConfigSerializerTest.java
new file mode 100644
index 0000000..01a30eb
--- /dev/null
+++ b/tika-core/src/test/java/org/apache/tika/config/TikaConfigSerializerTest.java
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.config;
+
+
+import java.io.StringWriter;
+import java.nio.charset.StandardCharsets;
+
+import org.junit.Ignore;
+import org.junit.Test;
+
+public class TikaConfigSerializerTest extends TikaConfigTest {
+
+    /**
+     * TIKA-1445 It should be possible to exclude DefaultParser from
+     *  certain types, so another parser explicitly listed will take them
+     */
+    @Test
+    public void defaultParserWithExcludes() throws Exception {
+        String xml = loadAndSerialize("TIKA-1445-default-except.xml",
+                TikaConfigSerializer.Mode.STATIC);
+        assertContains(
+                "<parser class=\"org.apache.tika.parser.ErrorParser\">" +
+                " <mime>fail/world</mime> " +
+                "</parser>", xml);
+    }
+
+    @Test
+    @Ignore("TODO: executor-service info needs to be stored in TikaConfig for serialization")
+    public void testExecutors() throws Exception {
+        String xml = loadAndSerialize("TIKA-1762-executors.xml",
+                TikaConfigSerializer.Mode.STATIC);
+        assertContains("<executor-service class=\"org.apache.tika.config.DummyExecutor\">" +
+                " <core-threads>3</core-threads>" +
+                " <max-threads>10</max-threads>" +
+                "</executor-service>", xml);
+    }
+
+    String loadAndSerialize(String configFile, TikaConfigSerializer.Mode mode) throws Exception {
+        TikaConfig config = getConfig(configFile);
+        StringWriter writer = new StringWriter();
+        TikaConfigSerializer.serialize(config, mode, writer, StandardCharsets.UTF_8);
+        return writer.toString().replaceAll("[\r\n\t ]+", " ");
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/3aa1dca4/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java
----------------------------------------------------------------------
diff --git a/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java b/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java
index 0c51634..b312032 100644
--- a/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java
+++ b/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java
@@ -24,36 +24,9 @@ import java.io.OutputStreamWriter;
 import java.io.StringWriter;
 import java.io.Writer;
 import java.nio.charset.Charset;
-import java.util.Collections;
-import java.util.List;
-import java.util.Set;
-import java.util.TreeSet;
 
-import javax.xml.parsers.DocumentBuilder;
-import javax.xml.parsers.DocumentBuilderFactory;
-import javax.xml.transform.OutputKeys;
-import javax.xml.transform.Transformer;
-import javax.xml.transform.TransformerFactory;
-import javax.xml.transform.dom.DOMSource;
-import javax.xml.transform.stream.StreamResult;
-
-import org.apache.tika.config.LoadErrorHandler;
-import org.apache.tika.config.ServiceLoader;
 import org.apache.tika.config.TikaConfig;
-import org.apache.tika.detect.CompositeDetector;
-import org.apache.tika.detect.DefaultDetector;
-import org.apache.tika.detect.Detector;
-import org.apache.tika.language.translate.DefaultTranslator;
-import org.apache.tika.language.translate.Translator;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.CompositeParser;
-import org.apache.tika.parser.DefaultParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.parser.ParserDecorator;
-import org.w3c.dom.Document;
-import org.w3c.dom.Element;
-import org.w3c.dom.Node;
+import org.apache.tika.config.TikaConfigSerializer;
 
 
 /**
@@ -67,214 +40,24 @@ import org.w3c.dom.Node;
  * for your custom mime types.
  */
 public class DumpTikaConfigExample {
-    /**
-     * @param config config file to dump
-     * @param writer writer to which to write
-     * @throws Exception
-     */
-    public void dump(TikaConfig config, Mode mode, Writer writer, String encoding) throws Exception {
-        DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance();
-        DocumentBuilder docBuilder = docFactory.newDocumentBuilder();
-        
-        // root elements
-        Document doc = docBuilder.newDocument();
-        Element rootElement = doc.createElement("properties");
-
-        doc.appendChild(rootElement);
-        addMimeComment(mode, rootElement, doc);
-        addServiceLoader(mode, rootElement, doc, config);
-        addTranslator(mode, rootElement, doc, config);
-        addDetectors(mode, rootElement, doc, config);
-        addParsers(mode, rootElement, doc, config);
-        // TODO Service Loader section
 
-        // now write
-        TransformerFactory transformerFactory = TransformerFactory.newInstance();
-        Transformer transformer = transformerFactory.newTransformer();
-        transformer.setOutputProperty(OutputKeys.INDENT, "yes");
-        transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "2");
-        transformer.setOutputProperty(OutputKeys.ENCODING, encoding);
-        DOMSource source = new DOMSource(doc);
-        StreamResult result = new StreamResult(writer);
-
-        transformer.transform(source, result);
-    }
-
-    private void addServiceLoader(Mode mode, Element rootElement, Document doc, TikaConfig config) {
-        ServiceLoader loader = config.getServiceLoader();
-        
-        if (mode == Mode.MINIMAL) {
-            // Is this the default?
-            if (loader.isDynamic() && loader.getLoadErrorHandler() == LoadErrorHandler.IGNORE) {
-                // Default config, no need to output anything
-                return;
-            }
-        }
-        
-        Element dslEl = doc.createElement("service-loader");
-        dslEl.setAttribute("dynamic", Boolean.toString(loader.isDynamic()));
-        dslEl.setAttribute("loadErrorHandler", loader.getLoadErrorHandler().toString());
-        rootElement.appendChild(dslEl);
-    }
-    
-    private void addTranslator(Mode mode, Element rootElement, Document doc, TikaConfig config) {
-        // Unlike the other entries, TikaConfig only wants one of
-        //  these, and no outer <translators> list
-        Translator translator = config.getTranslator();
-        if (mode == Mode.MINIMAL && translator instanceof DefaultTranslator) {
-            Node mimeComment = doc.createComment(
-                    "for example: <translator class=\"org.apache.tika.language.translate.GoogleTranslator\"/>");
-            rootElement.appendChild(mimeComment);
-        } else {
-            if (translator instanceof DefaultTranslator && mode == Mode.STATIC) {
-                translator = ((DefaultTranslator)translator).getTranslator();
-            }
-            if (translator != null) {
-                Element translatorElement = doc.createElement("translator");
-                translatorElement.setAttribute("class", translator.getClass().getCanonicalName());
-                rootElement.appendChild(translatorElement);
-            } else {
-                rootElement.appendChild(doc.createComment("No translators available"));
-            }
-        }
-    }
-
-    private void addMimeComment(Mode mode, Element rootElement, Document doc) {
-        Node mimeComment = doc.createComment(
-                "for example: <mimeTypeRepository resource=\"/org/apache/tika/mime/tika-mimetypes.xml\"/>");
-        rootElement.appendChild(mimeComment);
-    }
-
-    private void addDetectors(Mode mode, Element rootElement, Document doc, TikaConfig config) throws Exception {
-        Detector detector = config.getDetector();
-        
-        if (mode == Mode.MINIMAL && detector instanceof DefaultDetector) {
-            // Don't output anything, all using defaults
-            Node detComment = doc.createComment(
-                    "for example: <detectors><detector class=\"org.apache.tika.detector.MimeTypes\"></detectors>");
-            rootElement.appendChild(detComment);
-            return;
-        }
-        
-        Element detectorsElement = doc.createElement("detectors");
-        if (mode == Mode.CURRENT && detector instanceof DefaultDetector ||
-            ! (detector instanceof CompositeDetector)) {
-            Element detectorElement = doc.createElement("detector");
-            detectorElement.setAttribute("class", detector.getClass().getCanonicalName());
-            detectorsElement.appendChild(detectorElement);
-        } else {
-            List<Detector> children = ((CompositeDetector)detector).getDetectors();
-            for (Detector d : children) {
-                Element detectorElement = doc.createElement("detector");
-                detectorElement.setAttribute("class", d.getClass().getCanonicalName());
-                detectorsElement.appendChild(detectorElement);
-            }
-        }
-        rootElement.appendChild(detectorsElement);
-    }
-
-    private void addParsers(Mode mode, Element rootElement, Document doc, TikaConfig config) throws Exception {
-        Parser parser = config.getParser();
-        if (mode == Mode.MINIMAL && parser instanceof DefaultParser) {
-            // Don't output anything, all using defaults
-            return;
-        } else if (mode == Mode.MINIMAL) {
-            mode = Mode.CURRENT;
-        }
-
-        Element parsersElement = doc.createElement("parsers");
-        rootElement.appendChild(parsersElement);
-        
-        addParser(mode, parsersElement, doc, parser);
-    }
-    private void addParser(Mode mode, Element rootElement, Document doc, Parser parser) throws Exception {
-        // If the parser is decorated, is it a kind where we output the parser inside?
-        ParserDecorator decoration = null;
-        if (parser instanceof ParserDecorator) {
-            if (parser.getClass().getName().startsWith(ParserDecorator.class.getName()+"$")) {
-                decoration = ((ParserDecorator)parser);
-                parser = decoration.getWrappedParser();
-            }
-        }
-        
-        boolean outputParser = true;
-        List<Parser> children = Collections.emptyList();
-        if (mode == Mode.CURRENT && parser instanceof DefaultParser) {
-            // Only output the parser, not the children
-        } else if (parser instanceof CompositeParser) {
-            children = ((CompositeParser)parser).getAllComponentParsers();
-            // Special case for a naked composite
-            if (parser.getClass().equals(CompositeParser.class)) {
-                outputParser = false;
-            }
-            // Special case for making Default to static
-            if (mode == Mode.STATIC && parser instanceof DefaultParser) {
-                outputParser = false;
-            }
-        }
-        
-        if (outputParser) {
-            rootElement = addParser(rootElement, doc, parser, decoration);
-        }
-        for (Parser childParser : children) {
-            addParser(mode, rootElement, doc, childParser);
-        }
-        // TODO Parser Exclusions
-    }
-    private Element addParser(Element rootElement, Document doc, Parser parser, ParserDecorator decorator) throws Exception {
-        ParseContext context = new ParseContext();
-        
-        Set<MediaType> addedTypes = new TreeSet<>();
-        Set<MediaType> excludedTypes = new TreeSet<>();
-        if (decorator != null) {
-            Set<MediaType> types = new TreeSet<>();
-            types.addAll(decorator.getSupportedTypes(context));
-            addedTypes.addAll(types);
-            
-            for (MediaType type : parser.getSupportedTypes(context)) {
-                if (! types.contains(type)) {
-                    excludedTypes.add(type);
-                }
-                addedTypes.remove(type);
-            }
-        }
-        
-        String className = parser.getClass().getCanonicalName();
-        Element parserElement = doc.createElement("parser");
-        parserElement.setAttribute("class", className);
-        rootElement.appendChild(parserElement);
-        
-        for (MediaType type : addedTypes) {
-            Element mimeElement = doc.createElement("mime");
-            mimeElement.appendChild(doc.createTextNode(type.toString()));
-            parserElement.appendChild(mimeElement);
-        }
-        for (MediaType type : excludedTypes) {
-            Element mimeElement = doc.createElement("mime-exclude");
-            mimeElement.appendChild(doc.createTextNode(type.toString()));
-            parserElement.appendChild(mimeElement);
-        }
-        
-        return parserElement;
-    }
-    
     /**
      * @param args outputFile, outputEncoding, if args is empty, this prints to console
      * @throws Exception
      */
     public static void main(String[] args) throws Exception {
         Charset encoding = UTF_8;
-        Mode mode = Mode.CURRENT;
+        TikaConfigSerializer.Mode mode = TikaConfigSerializer.Mode.CURRENT;
         String filename = null;
         
         for (String arg : args) {
             if (arg.startsWith("-")) {
                 if (arg.contains("-dump-minimal")) {
-                    mode = Mode.MINIMAL;
+                    mode = TikaConfigSerializer.Mode.MINIMAL;
                 } else if (arg.contains("-dump-current")) {
-                    mode = Mode.CURRENT;
+                    mode = TikaConfigSerializer.Mode.CURRENT;
                 } else if (arg.contains("-dump-static")) {
-                    mode = Mode.STATIC;
+                    mode = TikaConfigSerializer.Mode.STATIC;
                 } else {
                     System.out.println("Use:");
                     System.out.println("  DumpTikaConfig [--dump-minimal] [--dump-current] [--dump-static] [filename] [encoding]");
@@ -299,7 +82,7 @@ public class DumpTikaConfigExample {
         }
         
         DumpTikaConfigExample ex = new DumpTikaConfigExample();
-        ex.dump(TikaConfig.getDefaultConfig(), mode, writer, encoding.name());
+        TikaConfigSerializer.serialize(TikaConfig.getDefaultConfig(), mode, writer, encoding);
 
         writer.flush();
 
@@ -308,7 +91,5 @@ public class DumpTikaConfigExample {
         }
         writer.close();
     }
-    protected enum Mode {
-        MINIMAL, CURRENT, STATIC;
-    }
+
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/3aa1dca4/tika-example/src/test/java/org/apache/tika/example/DumpTikaConfigExampleTest.java
----------------------------------------------------------------------
diff --git a/tika-example/src/test/java/org/apache/tika/example/DumpTikaConfigExampleTest.java b/tika-example/src/test/java/org/apache/tika/example/DumpTikaConfigExampleTest.java
index 29acfab..3f40600 100644
--- a/tika-example/src/test/java/org/apache/tika/example/DumpTikaConfigExampleTest.java
+++ b/tika-example/src/test/java/org/apache/tika/example/DumpTikaConfigExampleTest.java
@@ -30,8 +30,8 @@ import java.io.Writer;
 import java.nio.charset.Charset;
 
 import org.apache.tika.config.TikaConfig;
+import org.apache.tika.config.TikaConfigSerializer;
 import org.apache.tika.detect.CompositeDetector;
-import org.apache.tika.example.DumpTikaConfigExample.Mode;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.CompositeParser;
 import org.apache.tika.parser.Parser;
@@ -64,9 +64,9 @@ public class DumpTikaConfigExampleTest {
     public void testDump() throws Exception {
         DumpTikaConfigExample ex = new DumpTikaConfigExample();
         for (Charset charset : new Charset[]{UTF_8, UTF_16LE}) {
-            for (Mode mode : Mode.values()) {
+            for (TikaConfigSerializer.Mode mode : TikaConfigSerializer.Mode.values()) {
                 Writer writer = new OutputStreamWriter(new FileOutputStream(configFile), charset);
-                ex.dump(TikaConfig.getDefaultConfig(), mode, writer, charset.name());
+                TikaConfigSerializer.serialize(TikaConfig.getDefaultConfig(), mode, writer, charset);
                 writer.flush();
                 writer.close();
     


[2/2] tika git commit: TIKA-1657 move xmlification of TikaConfig to tika-core. Thank you, Nick!

Posted by ta...@apache.org.
TIKA-1657 move xmlification of TikaConfig to tika-core.  Thank you, Nick!


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/5a341071
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/5a341071
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/5a341071

Branch: refs/heads/master
Commit: 5a341071532ac950efeaad222afe3e4a33bb9bee
Parents: 3aa1dca
Author: tballison <ta...@mitre.org>
Authored: Wed Mar 2 09:20:02 2016 -0500
Committer: tballison <ta...@mitre.org>
Committed: Wed Mar 2 09:20:02 2016 -0500

----------------------------------------------------------------------
 CHANGES.txt | 3 +++
 1 file changed, 3 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/5a341071/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index e6603fa..08ac1dc 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,8 @@
 Release 1.13 - ???
 
+  * Move serialization of TikaConfig to tika-core and enable dumping
+    of the config file via tika-app (TIKA-1657).
+
   * Tika now incorporates the Natural Language Toolkit (NLTK) from the
     Python community as an option for Named Entity Recognition (TIKA-1876).