You are viewing a plain text version of this content. The canonical link for it was not preserved in this plain text copy.
Posted to commits@tika.apache.org by ni...@apache.org on 2015/09/23 23:54:22 UTC

svn commit: r1704943 - /tika/trunk/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java

Author: nick
Date: Wed Sep 23 21:54:21 2015
New Revision: 1704943

URL: http://svn.apache.org/viewvc?rev=1704943&view=rev
Log:
Parser updates for config dumping

Modified:
    tika/trunk/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java

Modified: tika/trunk/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java?rev=1704943&r1=1704942&r2=1704943&view=diff
==============================================================================
--- tika/trunk/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java (original)
+++ tika/trunk/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java Wed Sep 23 21:54:21 2015
@@ -20,15 +20,12 @@ package org.apache.tika.example;
 import static java.nio.charset.StandardCharsets.UTF_8;
 
 import java.io.FileOutputStream;
-import java.io.IOException;
 import java.io.OutputStreamWriter;
 import java.io.StringWriter;
 import java.io.Writer;
 import java.nio.charset.Charset;
 import java.util.List;
-import java.util.Map;
 import java.util.Set;
-import java.util.TreeMap;
 import java.util.TreeSet;
 
 import javax.xml.parsers.DocumentBuilder;
@@ -43,13 +40,14 @@ import org.apache.tika.config.TikaConfig
 import org.apache.tika.detect.CompositeDetector;
 import org.apache.tika.detect.DefaultDetector;
 import org.apache.tika.detect.Detector;
-import org.apache.tika.exception.TikaException;
 import org.apache.tika.language.translate.DefaultTranslator;
 import org.apache.tika.language.translate.Translator;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.CompositeParser;
+import org.apache.tika.parser.DefaultParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.ParserDecorator;
 import org.w3c.dom.Document;
 import org.w3c.dom.Element;
 import org.w3c.dom.Node;
@@ -84,6 +82,7 @@ public class DumpTikaConfigExample {
         addTranslator(mode, rootElement, doc, config);
         addDetectors(mode, rootElement, doc, config);
         addParsers(mode, rootElement, doc, config);
+        // TODO Service Loader section
 
         // now write
         TransformerFactory transformerFactory = TransformerFactory.newInstance();
@@ -130,6 +129,9 @@ public class DumpTikaConfigExample {
         
         if (mode == Mode.MINIMAL && detector instanceof DefaultDetector) {
             // Don't output anything, all using defaults
+            Node detComment = doc.createComment(
+                    "for example: <detectors><detector class=\"org.apache.tika.detector.MimeTypes\"></detectors>");
+            rootElement.appendChild(detComment);
             return;
         }
         
@@ -151,54 +153,83 @@ public class DumpTikaConfigExample {
     }
 
     private void addParsers(Mode mode, Element rootElement, Document doc, TikaConfig config) throws Exception {
-        Map<String, Parser> parsers = getConcreteParsers(config.getParser());
-
-        Element parsersElement = doc.createElement("parsers");
-        rootElement.appendChild(parsersElement);
-
-        ParseContext context = new ParseContext();
-        for (Map.Entry<String, Parser> e : parsers.entrySet()) {
-            Element parserElement = doc.createElement("parser");
-            Parser child = e.getValue();
-            String className = e.getKey();
-            parserElement.setAttribute("class", className);
-            Set<MediaType> types = new TreeSet<>();
-            types.addAll(child.getSupportedTypes(context));
-            for (MediaType type : types) {
-                Element mimeElement = doc.createElement("mime");
-                mimeElement.appendChild(doc.createTextNode(type.toString()));
-                parserElement.appendChild(mimeElement);
+        Parser parser = config.getParser();
+        if (mode == Mode.MINIMAL && parser instanceof DefaultParser) {
+            // Don't output anything, all using defaults
+            return;
+        } else if (mode == Mode.MINIMAL) {
+            mode = Mode.CURRENT;
+        }
+        addParsers(mode, rootElement, doc, parser);
+    }
+    private void addParsers(Mode mode, Element rootElement, Document doc, Parser parser) throws Exception {
+        Parser realParser = parser;
+        if (parser instanceof ParserDecorator) {
+            realParser = ((ParserDecorator)parser).getWrappedParser();
+        }
+        
+        List<Parser> children = null;
+        if (mode == Mode.CURRENT && realParser instanceof DefaultParser) {
+            // Don't output any children
+            // TODO List excluded children
+        } else if (realParser instanceof CompositeParser) {
+            children = ((CompositeParser)realParser).getAllComponentParsers();
+            if (realParser instanceof DefaultParser || parser == realParser) {
+                realParser = null;
             }
-            parsersElement.appendChild(parserElement);
         }
+        
+        Element parsersElement = doc.createElement("parsers");
         rootElement.appendChild(parsersElement);
-
-    }
-
-    private Map<String, Parser> getConcreteParsers(Parser parentParser) throws TikaException, IOException {
-        Map<String, Parser> parsers = new TreeMap<>();
-        if (parentParser instanceof CompositeParser) {
-            addParsers((CompositeParser) parentParser, parsers);
-        } else {
-            addParser(parentParser, parsers);
+        Element addParserTo = parsersElement;
+        
+        if (realParser != null) {
+            addParserTo = addParser(addParserTo, doc, parser, realParser);
         }
-        return parsers;
-    }
-
-    private void addParsers(CompositeParser p, Map<String, Parser> parsers) {
-        for (Parser child : p.getParsers().values()) {
-            if (child instanceof CompositeParser) {
-                addParsers((CompositeParser) child, parsers);
-            } else {
-                addParser(child, parsers);
+        if (children != null && !children.isEmpty()) {
+            for (Parser p : children) {
+                addParser(addParserTo, doc, p, p);
             }
         }
     }
-
-    private void addParser(Parser p, Map<String, Parser> parsers) {
-        parsers.put(p.getClass().getCanonicalName(), p);
+    private Element addParser(Element rootElement, Document doc, Parser parser, Parser realParser) throws Exception {
+        ParseContext context = new ParseContext();
+        
+        Set<MediaType> types = new TreeSet<>();
+        Set<MediaType> addedTypes = new TreeSet<>();
+        Set<MediaType> excludedTypes = new TreeSet<>();
+        types.addAll(parser.getSupportedTypes(context));
+        
+        for (MediaType type : realParser.getSupportedTypes(context)) {
+            if (! types.contains(type)) {
+                excludedTypes.add(type);
+            }
+            addedTypes.remove(type);
+        }
+        
+        String className = realParser.getClass().getCanonicalName();
+        Element parserElement = doc.createElement("parser");
+        parserElement.setAttribute("class", className);
+        rootElement.appendChild(parserElement);
+        
+        for (MediaType type : addedTypes) {
+            Element mimeElement = doc.createElement("mime");
+            mimeElement.appendChild(doc.createTextNode(type.toString()));
+            parserElement.appendChild(mimeElement);
+        }
+        for (MediaType type : excludedTypes) {
+            Element mimeElement = doc.createElement("mime-exclude");
+            mimeElement.appendChild(doc.createTextNode(type.toString()));
+            parserElement.appendChild(mimeElement);
+        }
+        
+        if (realParser instanceof CompositeParser) {
+            // TODO Recurse
+        }
+        
+        return parserElement;
     }
-
+    
     /**
      * @param args outputFile, outputEncoding, if args is empty, this prints to console
      * @throws Exception