You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2014/09/19 16:02:16 UTC
svn commit: r1626222 - in /tika/trunk: tika-app/src/test/resources/test-data/
tika-example/src/main/java/org/apache/tika/example/
tika-example/src/test/java/org/apache/tika/example/
Author: tallison
Date: Fri Sep 19 14:02:16 2014
New Revision: 1626222
URL: http://svn.apache.org/r1626222
Log:
TIKA-1418 add files
Added:
tika/trunk/tika-app/src/test/resources/test-data/bad_xml.xml
tika/trunk/tika-app/src/test/resources/test-data/tika-config1.xml
tika/trunk/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java
tika/trunk/tika-example/src/test/java/org/apache/tika/example/DumpTikaConfigExampleTest.java
Added: tika/trunk/tika-app/src/test/resources/test-data/bad_xml.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/test/resources/test-data/bad_xml.xml?rev=1626222&view=auto
==============================================================================
--- tika/trunk/tika-app/src/test/resources/test-data/bad_xml.xml (added)
+++ tika/trunk/tika-app/src/test/resources/test-data/bad_xml.xml Fri Sep 19 14:02:16 2014
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<grocery_list>
+ <item>apple</item>
+ <item>orange</item>
+ <item>pear<item>
+</grocery_list>
\ No newline at end of file
Added: tika/trunk/tika-app/src/test/resources/test-data/tika-config1.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/test/resources/test-data/tika-config1.xml?rev=1626222&view=auto
==============================================================================
--- tika/trunk/tika-app/src/test/resources/test-data/tika-config1.xml (added)
+++ tika/trunk/tika-app/src/test/resources/test-data/tika-config1.xml Fri Sep 19 14:02:16 2014
@@ -0,0 +1,14 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<properties>
+ <parsers>
+
+ <parser class="org.apache.tika.parser.html.HtmlParser">
+ <mime>application/vnd.wap.xhtml+xml</mime>
+ <mime>application/x-asp</mime>
+ <mime>application/xhtml+xml</mime>
+ <mime>text/html</mime>
+ <mime>application/xml</mime>
+ <mime>text/xml</mime>
+ </parser>
+ </parsers>
+</properties>
Added: tika/trunk/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java?rev=1626222&view=auto
==============================================================================
--- tika/trunk/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java (added)
+++ tika/trunk/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java Fri Sep 19 14:02:16 2014
@@ -0,0 +1,212 @@
+package org.apache.tika.example;
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.detect.DefaultDetector;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.language.translate.DefaultTranslator;
+import org.apache.tika.language.translate.Translator;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.CompositeParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.Node;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.transform.OutputKeys;
+import javax.xml.transform.Transformer;
+import javax.xml.transform.TransformerFactory;
+import javax.xml.transform.dom.DOMSource;
+import javax.xml.transform.stream.StreamResult;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.StringWriter;
+import java.io.Writer;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeMap;
+import java.util.TreeSet;
+
+
+/**
+ * This class shows how to dump a TikaConfig object to a configuration file.
+ * This allows users to easily dump the default TikaConfig as a base from which
+ * to start if they want to modify the default configuration file.
+ * <p>
+ * For those who want to modify the mimes file, take a look at
+ * tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+ * for inspiration. Consider adding org/apache/tika/mime/custom-mimetypes.xml
+ * for your custom mime types.
+ */
+public class DumpTikaConfigExample {
+
+    /**
+     * Builds an XML configuration document describing the translator,
+     * detectors and parsers of the given config and writes it to the writer.
+     *
+     * @param config   config to dump
+     * @param writer   writer to which to write; this method does not close it,
+     *                 so the caller stays responsible for flushing/closing
+     * @param encoding encoding name announced in the XML output; it should
+     *                 match the charset the writer actually encodes with
+     * @throws Exception if the document cannot be built or serialized
+     */
+    public void dump(TikaConfig config, Writer writer, String encoding) throws Exception {
+        DocumentBuilder docBuilder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
+
+        // root element
+        Document doc = docBuilder.newDocument();
+        Element rootElement = doc.createElement("properties");
+        doc.appendChild(rootElement);
+
+        addMimeComment(rootElement, doc);
+        addTranslator(rootElement, doc, config);
+        addDetectors(rootElement, doc, config);
+        addParsers(rootElement, doc, config);
+
+        // now write
+        Transformer transformer = TransformerFactory.newInstance().newTransformer();
+        transformer.setOutputProperty(OutputKeys.INDENT, "yes");
+        transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "2");
+        transformer.setOutputProperty(OutputKeys.ENCODING, encoding);
+        transformer.transform(new DOMSource(doc), new StreamResult(writer));
+    }
+
+    /**
+     * Adds the configured translator to the root element. When the config uses
+     * the {@link DefaultTranslator}, only an explanatory comment is written.
+     */
+    private void addTranslator(Element rootElement, Document doc, TikaConfig config) {
+        //TikaConfig only reads the first translator from the list,
+        //but it looks like it expects a list
+        Translator translator = config.getTranslator();
+        if (translator instanceof DefaultTranslator) {
+            Node exampleComment = doc.createComment(
+                    "for example: " +
+                    "<translator class=\"org.apache.tika.language.translate.GoogleTranslator\"/>");
+            rootElement.appendChild(exampleComment);
+        } else {
+            Element translatorElement = doc.createElement("translator");
+            // getName() is the binary name that can be loaded by name;
+            // getCanonicalName() is null for anonymous/local classes
+            translatorElement.setAttribute("class", translator.getClass().getName());
+            rootElement.appendChild(translatorElement);
+        }
+    }
+
+    /**
+     * Adds a comment showing how a mime type repository could be configured.
+     */
+    private void addMimeComment(Element rootElement, Document doc) {
+        Node mimeComment = doc.createComment(
+                "for example: <mimeTypeRepository resource=\"/org/apache/tika/mime/tika-mimetypes.xml\"/>");
+        rootElement.appendChild(mimeComment);
+    }
+
+    /**
+     * Adds a "detectors" element with one "detector" child per detector held
+     * by a {@link DefaultDetector}.
+     */
+    private void addDetectors(Element rootElement, Document doc, TikaConfig config) throws Exception {
+        Detector detector = config.getDetector();
+        Element detectorsElement = doc.createElement("detectors");
+
+        // NOTE(review): any detector that is not a DefaultDetector results in
+        // an empty "detectors" element - confirm that this is intended
+        if (detector instanceof DefaultDetector) {
+            List<Detector> children = ((DefaultDetector) detector).getDetectors();
+            for (Detector d : children) {
+                Element detectorElement = doc.createElement("detector");
+                detectorElement.setAttribute("class", d.getClass().getName());
+                detectorsElement.appendChild(detectorElement);
+            }
+        }
+        rootElement.appendChild(detectorsElement);
+    }
+
+    /**
+     * Adds a "parsers" element with one "parser" child per concrete parser,
+     * each listing the mime types that parser reports as supported.
+     */
+    private void addParsers(Element rootElement, Document doc, TikaConfig config) throws Exception {
+        Map<String, Parser> parsers = getConcreteParsers(config.getParser());
+        Element parsersElement = doc.createElement("parsers");
+
+        ParseContext context = new ParseContext();
+        for (Map.Entry<String, Parser> e : parsers.entrySet()) {
+            Element parserElement = doc.createElement("parser");
+            parserElement.setAttribute("class", e.getKey());
+
+            // a TreeSet keeps the mime entries in sorted order in the output
+            Set<MediaType> types = new TreeSet<MediaType>(e.getValue().getSupportedTypes(context));
+            for (MediaType type : types) {
+                Element mimeElement = doc.createElement("mime");
+                mimeElement.appendChild(doc.createTextNode(type.toString()));
+                parserElement.appendChild(mimeElement);
+            }
+            parsersElement.appendChild(parserElement);
+        }
+        // attach exactly once, after all children have been collected
+        rootElement.appendChild(parsersElement);
+    }
+
+    /**
+     * Flattens the given parser into a map from class name to parser instance,
+     * descending into composite parsers. The map is sorted by class name.
+     */
+    private Map<String, Parser> getConcreteParsers(Parser parentParser) throws TikaException, IOException {
+        Map<String, Parser> parsers = new TreeMap<String, Parser>();
+        if (parentParser instanceof CompositeParser) {
+            addParsers((CompositeParser) parentParser, parsers);
+        } else {
+            addParser(parentParser, parsers);
+        }
+        return parsers;
+    }
+
+    /**
+     * Recursively collects the non-composite children of a composite parser.
+     * Nothing is printed here: the console is reserved for the XML dump that
+     * {@link #main(String[])} writes when no output file is given.
+     */
+    private void addParsers(CompositeParser p, Map<String, Parser> parsers) {
+        for (Parser child : p.getParsers().values()) {
+            if (child instanceof CompositeParser) {
+                addParsers((CompositeParser) child, parsers);
+            } else {
+                addParser(child, parsers);
+            }
+        }
+    }
+
+    /**
+     * Registers a single parser under its class name; a later parser of the
+     * same class replaces an earlier one.
+     */
+    private void addParser(Parser p, Map<String, Parser> parsers) {
+        // getName() never returns null (getCanonicalName() does for anonymous
+        // and local classes, which a TreeMap rejects as a key) and it is the
+        // form that can be loaded by name for nested classes
+        parsers.put(p.getClass().getName(), p);
+    }
+
+    /**
+     * Dumps the default Tika configuration.
+     *
+     * @param args outputFile, outputEncoding; if args is empty, this prints to console
+     * @throws Exception if the configuration cannot be dumped or written
+     */
+    public static void main(String[] args) throws Exception {
+        // resolve the encoding before opening the file so that the bytes
+        // written agree with the encoding announced in the XML declaration
+        String encoding = (args.length > 1) ? args[1] : "UTF-8";
+
+        Writer writer;
+        if (args.length > 0) {
+            writer = new OutputStreamWriter(new FileOutputStream(new File(args[0])), encoding);
+        } else {
+            writer = new StringWriter();
+        }
+
+        try {
+            DumpTikaConfigExample ex = new DumpTikaConfigExample();
+            ex.dump(TikaConfig.getDefaultConfig(), writer, encoding);
+            writer.flush();
+
+            if (writer instanceof StringWriter) {
+                System.out.println(writer.toString());
+            }
+        } finally {
+            // release the file handle even when dumping fails
+            writer.close();
+        }
+    }
+}
Added: tika/trunk/tika-example/src/test/java/org/apache/tika/example/DumpTikaConfigExampleTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-example/src/test/java/org/apache/tika/example/DumpTikaConfigExampleTest.java?rev=1626222&view=auto
==============================================================================
--- tika/trunk/tika-example/src/test/java/org/apache/tika/example/DumpTikaConfigExampleTest.java (added)
+++ tika/trunk/tika-example/src/test/java/org/apache/tika/example/DumpTikaConfigExampleTest.java Fri Sep 19 14:02:16 2014
@@ -0,0 +1,83 @@
+package org.apache.tika.example;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.detect.CompositeDetector;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.CompositeParser;
+import org.apache.tika.parser.Parser;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+
+import static junit.framework.TestCase.assertEquals;
+import static junit.framework.TestCase.assertTrue;
+
+public class DumpTikaConfigExampleTest {
+    /** Temporary file that receives the dumped configuration; created before each test. */
+    private File configFile;
+
+    /**
+     * Creates the temporary output file used by the test.
+     */
+    @Before
+    public void setUp() {
+        try {
+            configFile = File.createTempFile("tmp", ".xml");
+        } catch (IOException e) {
+            // keep the cause so the underlying I/O problem shows up in the report
+            throw new RuntimeException("Failed to create tmp file", e);
+        }
+    }
+
+    /**
+     * Deletes the temporary file and fails loudly if it is still present afterwards.
+     */
+    @After
+    public void tearDown() {
+        if (configFile == null) {
+            return;
+        }
+        if (configFile.exists()) {
+            configFile.delete();
+        }
+        if (configFile.exists()) {
+            throw new RuntimeException("Failed to clean up: " + configFile.getAbsolutePath());
+        }
+    }
+
+    /**
+     * Dumps the default config in two encodings, reloads each dump as a
+     * TikaConfig and checks that parsers and detectors survived the round trip.
+     */
+    @Test
+    public void testDump() throws Exception {
+        DumpTikaConfigExample ex = new DumpTikaConfigExample();
+        for (String encoding : new String[]{"UTF-8", "UTF-16LE"}) {
+            Writer writer = new OutputStreamWriter(new FileOutputStream(configFile), encoding);
+            try {
+                ex.dump(TikaConfig.getDefaultConfig(), writer, encoding);
+                writer.flush();
+            } finally {
+                // always release the handle; an open handle may prevent
+                // tearDown from deleting the file on some platforms
+                writer.close();
+            }
+
+            TikaConfig c = new TikaConfig(configFile);
+            assertEquals(CompositeParser.class, c.getParser().getClass());
+            assertEquals(CompositeDetector.class, c.getDetector().getClass());
+
+            CompositeParser p = (CompositeParser) c.getParser();
+            assertTrue("enough parsers?", p.getParsers().size() > 130);
+
+            CompositeDetector d = (CompositeDetector) c.getDetector();
+            assertTrue("enough detectors?", d.getDetectors().size() > 3);
+
+            //just try to load it into autodetect to make sure no errors are thrown
+            new AutoDetectParser(c);
+        }
+    }
+
+}