You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2014/09/19 16:02:16 UTC

svn commit: r1626222 - in /tika/trunk: tika-app/src/test/resources/test-data/ tika-example/src/main/java/org/apache/tika/example/ tika-example/src/test/java/org/apache/tika/example/

Author: tallison
Date: Fri Sep 19 14:02:16 2014
New Revision: 1626222

URL: http://svn.apache.org/r1626222
Log:
TIKA-1418 add files

Added:
    tika/trunk/tika-app/src/test/resources/test-data/bad_xml.xml
    tika/trunk/tika-app/src/test/resources/test-data/tika-config1.xml
    tika/trunk/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java
    tika/trunk/tika-example/src/test/java/org/apache/tika/example/DumpTikaConfigExampleTest.java

Added: tika/trunk/tika-app/src/test/resources/test-data/bad_xml.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/test/resources/test-data/bad_xml.xml?rev=1626222&view=auto
==============================================================================
--- tika/trunk/tika-app/src/test/resources/test-data/bad_xml.xml (added)
+++ tika/trunk/tika-app/src/test/resources/test-data/bad_xml.xml Fri Sep 19 14:02:16 2014
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- NOTE(review): the "pear" entry below ends with an opening <item> tag instead of
+     </item>, so this document is not well-formed. Given the file name this is
+     presumably intentional test data; confirm before "fixing" it. -->
+<grocery_list>
+    <item>apple</item>
+    <item>orange</item>
+    <item>pear<item>
+</grocery_list>
\ No newline at end of file

Added: tika/trunk/tika-app/src/test/resources/test-data/tika-config1.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/test/resources/test-data/tika-config1.xml?rev=1626222&view=auto
==============================================================================
--- tika/trunk/tika-app/src/test/resources/test-data/tika-config1.xml (added)
+++ tika/trunk/tika-app/src/test/resources/test-data/tika-config1.xml Fri Sep 19 14:02:16 2014
@@ -0,0 +1,14 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Tika configuration fixture: declares a single parser, HtmlParser, and lists
+     the six media types (HTML/XHTML variants plus the two generic XML types)
+     it is registered for. -->
+<properties>
+  <parsers>
+
+    <parser class="org.apache.tika.parser.html.HtmlParser">
+      <mime>application/vnd.wap.xhtml+xml</mime>
+      <mime>application/x-asp</mime>
+      <mime>application/xhtml+xml</mime>
+      <mime>text/html</mime>
+      <mime>application/xml</mime>
+      <mime>text/xml</mime>
+    </parser>
+  </parsers>
+</properties>

Added: tika/trunk/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java?rev=1626222&view=auto
==============================================================================
--- tika/trunk/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java (added)
+++ tika/trunk/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java Fri Sep 19 14:02:16 2014
@@ -0,0 +1,212 @@
+package org.apache.tika.example;
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.detect.DefaultDetector;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.language.translate.DefaultTranslator;
+import org.apache.tika.language.translate.Translator;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.CompositeParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.Node;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.transform.OutputKeys;
+import javax.xml.transform.Transformer;
+import javax.xml.transform.TransformerFactory;
+import javax.xml.transform.dom.DOMSource;
+import javax.xml.transform.stream.StreamResult;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.StringWriter;
+import java.io.Writer;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeMap;
+import java.util.TreeSet;
+
+
+/**
+ * This class shows how to dump a TikaConfig object to a configuration file.
+ * This allows users to easily dump the default TikaConfig as a base from which
+ * to start if they want to modify the default configuration file.
+ * <p>
+ * For those who want to modify the mimes file, take a look at
+ * tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+ * for inspiration.  Consider adding org/apache/tika/mime/custom-mimetypes.xml
+ * for your custom mime types.
+ */
+public class DumpTikaConfigExample {
+
+    /**
+     * Serializes the given configuration as an XML document rooted at a
+     * <code>properties</code> element. The document contains a hint comment about
+     * the mime type repository, the translator (or a hint comment when the default
+     * translator is in use), a <code>detectors</code> element and a
+     * <code>parsers</code> element.
+     *
+     * @param config config file to dump
+     * @param writer writer to which to write; it is not closed by this method
+     * @param encoding encoding name to declare in the XML output; because the
+     *                 output goes to a character writer, the caller has to make
+     *                 sure the writer really encodes its bytes with this encoding
+     * @throws Exception if the document cannot be built or serialized
+     */
+    public void dump(TikaConfig config, Writer writer, String encoding) throws Exception {
+        DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance();
+        DocumentBuilder docBuilder = docFactory.newDocumentBuilder();
+        // root elements
+        Document doc = docBuilder.newDocument();
+        Element rootElement = doc.createElement("properties");
+
+        doc.appendChild(rootElement);
+        addMimeComment(rootElement, doc);
+        addTranslator(rootElement, doc, config);
+        addDetectors(rootElement, doc, config);
+        addParsers(rootElement, doc, config);
+
+        //now write
+        TransformerFactory transformerFactory = TransformerFactory.newInstance();
+        Transformer transformer = transformerFactory.newTransformer();
+        transformer.setOutputProperty(OutputKeys.INDENT, "yes");
+        transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "2");
+        transformer.setOutputProperty(OutputKeys.ENCODING, encoding);
+        DOMSource source = new DOMSource(doc);
+        StreamResult result = new StreamResult(writer);
+
+        transformer.transform(source, result);
+    }
+
+    /**
+     * Appends the configured translator to the root element. When the config
+     * uses the {@link DefaultTranslator}, only an explanatory comment is written.
+     */
+    private void addTranslator(Element rootElement, Document doc, TikaConfig config) {
+        //TikaConfig only reads the first translator from the list,
+        //but it looks like it expects a list
+        Translator translator = config.getTranslator();
+        if (translator instanceof DefaultTranslator) {
+            Node translatorComment = doc.createComment(
+                    "for example: "+
+                            "<translator class=\"org.apache.tika.language.translate.GoogleTranslator\"/>");
+            rootElement.appendChild(translatorComment);
+        } else {
+            Element translatorElement = doc.createElement("translator");
+            //getName() is the form that can be loaded by name again
+            translatorElement.setAttribute("class", translator.getClass().getName());
+            rootElement.appendChild(translatorElement);
+        }
+    }
+
+    /**
+     * Appends a comment showing how a mime type repository could be configured.
+     */
+    private void addMimeComment(Element rootElement, Document doc) {
+        Node mimeComment = doc.createComment(
+                "for example: <mimeTypeRepository resource=\"/org/apache/tika/mime/tika-mimetypes.xml\"/>");
+        rootElement.appendChild(mimeComment);
+    }
+
+    /**
+     * Appends a <code>detectors</code> element with one <code>detector</code>
+     * child per detector held by the config's {@link DefaultDetector}.
+     * <p>
+     * NOTE(review): when the config's detector is not a DefaultDetector, the
+     * <code>detectors</code> element is written without children; confirm
+     * whether other detector types should be dumped as well.
+     */
+    private void addDetectors(Element rootElement, Document doc, TikaConfig config) throws Exception {
+        Detector detector = config.getDetector();
+        Element detectorsElement = doc.createElement("detectors");
+
+        if (detector instanceof DefaultDetector) {
+            List<Detector> children = ((DefaultDetector)detector).getDetectors();
+            for (Detector d : children) {
+                Element detectorElement = doc.createElement("detector");
+                detectorElement.setAttribute("class", d.getClass().getName());
+                detectorsElement.appendChild(detectorElement);
+            }
+        }
+        rootElement.appendChild(detectorsElement);
+    }
+
+    /**
+     * Appends a <code>parsers</code> element with one <code>parser</code> child
+     * per concrete parser, each listing its supported media types as
+     * <code>mime</code> children in sorted order.
+     */
+    private void addParsers(Element rootElement, Document doc, TikaConfig config) throws Exception {
+        Map<String, Parser> parsers = getConcreteParsers(config.getParser());
+
+        Element parsersElement = doc.createElement("parsers");
+
+        ParseContext context = new ParseContext();
+        for (Map.Entry<String, Parser> e : parsers.entrySet()) {
+            Element parserElement = doc.createElement("parser");
+            Parser child = e.getValue();
+            String className = e.getKey();
+            parserElement.setAttribute("class", className);
+            //copy into a TreeSet so that the mime entries come out sorted
+            Set<MediaType> types = new TreeSet<MediaType>();
+            types.addAll(child.getSupportedTypes(context));
+            for (MediaType type : types){
+                Element mimeElement = doc.createElement("mime");
+                mimeElement.appendChild(doc.createTextNode(type.toString()));
+                parserElement.appendChild(mimeElement);
+            }
+            parsersElement.appendChild(parserElement);
+        }
+        //attach the populated element exactly once
+        rootElement.appendChild(parsersElement);
+    }
+
+    /**
+     * Flattens the given parser into a map from class name to parser instance,
+     * sorted by class name. Composite parsers are expanded recursively; any
+     * other parser is added as is.
+     */
+    private Map<String, Parser> getConcreteParsers(Parser parentParser)throws TikaException, IOException  {
+        Map<String, Parser> parsers = new TreeMap<String, Parser>();
+        if (parentParser instanceof CompositeParser) {
+            addParsers((CompositeParser)parentParser, parsers);
+        } else {
+            addParser(parentParser, parsers);
+        }
+        return parsers;
+    }
+
+    /**
+     * Recursively collects the non-composite parsers below the given composite.
+     */
+    private void addParsers(CompositeParser p, Map<String, Parser> parsers) {
+        for (Parser child : p.getParsers().values()) {
+            if (child instanceof CompositeParser) {
+                addParsers((CompositeParser)child, parsers);
+            } else {
+                addParser(child, parsers);
+            }
+        }
+    }
+
+    /**
+     * Registers a single parser under its class name. If the same class was
+     * already registered, the earlier entry is replaced.
+     */
+    private void addParser(Parser p, Map<String, Parser> parsers) {
+        //getName() never returns null (getCanonicalName() does for anonymous and
+        //local classes, which would blow up as a TreeMap key) and it yields the
+        //Outer$Inner form that can be loaded by name for nested classes
+        parsers.put(p.getClass().getName(), p);
+    }
+
+    /**
+     * Dumps the default TikaConfig either to a file or to the console.
+     *
+     * @param args outputFile, outputEncoding, if args is empty, this prints to console;
+     *             the encoding defaults to UTF-8
+     * @throws Exception if the config cannot be dumped or the output cannot be written
+     */
+    public static void main(String[] args) throws Exception {
+
+        //settle the encoding first: the file writer has to encode with the
+        //same encoding that is declared in the XML output
+        String encoding = "UTF-8";
+        if (args.length > 1) {
+            encoding = args[1];
+        }
+
+        Writer writer;
+        if (args.length > 0) {
+            FileOutputStream os = new FileOutputStream(new File(args[0]));
+            try {
+                writer = new OutputStreamWriter(os, encoding);
+            } catch (IOException e) {
+                //unsupported encoding name: do not leave the file handle open
+                os.close();
+                throw e;
+            }
+        } else {
+            writer = new StringWriter();
+        }
+
+        try {
+            DumpTikaConfigExample ex = new DumpTikaConfigExample();
+            ex.dump(TikaConfig.getDefaultConfig(), writer, encoding);
+
+            writer.flush();
+
+            if (writer instanceof StringWriter) {
+                System.out.println(writer.toString());
+            }
+        } finally {
+            //close even when dumping fails
+            writer.close();
+        }
+    }
+}

Added: tika/trunk/tika-example/src/test/java/org/apache/tika/example/DumpTikaConfigExampleTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-example/src/test/java/org/apache/tika/example/DumpTikaConfigExampleTest.java?rev=1626222&view=auto
==============================================================================
--- tika/trunk/tika-example/src/test/java/org/apache/tika/example/DumpTikaConfigExampleTest.java (added)
+++ tika/trunk/tika-example/src/test/java/org/apache/tika/example/DumpTikaConfigExampleTest.java Fri Sep 19 14:02:16 2014
@@ -0,0 +1,83 @@
+package org.apache.tika.example;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.detect.CompositeDetector;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.CompositeParser;
+import org.apache.tika.parser.Parser;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+
+import static junit.framework.TestCase.assertEquals;
+import static junit.framework.TestCase.assertTrue;
+
+public class DumpTikaConfigExampleTest {
+    //temp file the config is dumped to; created before and removed after each test
+    private File configFile;
+
+    /**
+     * Creates the temp file that receives the dumped config.
+     */
+    @Before
+    public void setUp() {
+        try {
+            configFile = File.createTempFile("tmp", ".xml");
+        } catch (IOException e) {
+            //keep the cause so that the reason for the failure is reported
+            throw new RuntimeException("Failed to create tmp file", e);
+        }
+    }
+
+    /**
+     * Deletes the temp file and fails if it is still there afterwards.
+     */
+    @After
+    public void tearDown() {
+        if (configFile != null && configFile.exists()) {
+            configFile.delete();
+        }
+        if (configFile != null && configFile.exists()) {
+            throw new RuntimeException("Failed to clean up: "+configFile.getAbsolutePath());
+        }
+    }
+
+    /**
+     * Dumps the default config in two encodings and checks that each dump can
+     * be loaded back into a TikaConfig with a plausible number of parsers and
+     * detectors.
+     */
+    @Test
+    public void testDump() throws Exception {
+        DumpTikaConfigExample ex = new DumpTikaConfigExample();
+        for (String encoding : new String[]{ "UTF-8", "UTF-16LE"}) {
+            Writer writer = new OutputStreamWriter(new FileOutputStream(configFile), encoding);
+            try {
+                ex.dump(TikaConfig.getDefaultConfig(), writer, encoding);
+                writer.flush();
+            } finally {
+                //release the file even when the dump fails, otherwise
+                //tearDown may not be able to delete it
+                writer.close();
+            }
+
+            TikaConfig c = new TikaConfig(configFile);
+            assertEquals(CompositeParser.class, c.getParser().getClass());
+            assertEquals(CompositeDetector.class, c.getDetector().getClass());
+
+            CompositeParser p = (CompositeParser) c.getParser();
+            assertTrue("enough parsers?", p.getParsers().size() > 130);
+
+            CompositeDetector d = (CompositeDetector) c.getDetector();
+            assertTrue("enough detectors?", d.getDetectors().size() > 3);
+            //just try to load it into autodetect to make sure no errors are thrown
+            Parser auto = new AutoDetectParser(c);
+        }
+    }
+
+}