You are viewing a plain text version of this content. The hyperlink to the canonical version was not preserved in this plain-text copy.
Posted to commits@tika.apache.org by ni...@apache.org on 2015/09/23 23:04:08 UTC
svn commit: r1704934 - in /tika/trunk:
tika-core/src/main/java/org/apache/tika/language/translate/
tika-example/src/main/java/org/apache/tika/example/
tika-example/src/test/java/org/apache/tika/example/
Author: nick
Date: Wed Sep 23 21:04:08 2015
New Revision: 1704934
URL: http://svn.apache.org/viewvc?rev=1704934&view=rev
Log:
TIKA-1657 Update the example of dumping a Tika Config to support different output modes, for Translators and Detectors
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/language/translate/DefaultTranslator.java
tika/trunk/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java
tika/trunk/tika-example/src/test/java/org/apache/tika/example/DumpTikaConfigExampleTest.java
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/language/translate/DefaultTranslator.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/language/translate/DefaultTranslator.java?rev=1704934&r1=1704933&r2=1704934&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/language/translate/DefaultTranslator.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/language/translate/DefaultTranslator.java Wed Sep 23 21:04:08 2015
@@ -99,6 +99,19 @@ public class DefaultTranslator implement
}
throw new TikaException("No translators currently available");
}
+
+ /**
+ * Returns the translators that getDefaultTranslators yields for this instance's loader (NOTE(review): confirm whether unavailable ones are filtered out)
+ */
+ public List<Translator> getTranslators() {
+ return getDefaultTranslators(loader);
+ }
+ /**
+ * Returns the current translator, i.e. the result of getFirstAvailable for this instance's loader; may be null when none is available (isAvailable() checks exactly that)
+ */
+ public Translator getTranslator() {
+ return getFirstAvailable(loader);
+ }
public boolean isAvailable() {
return getFirstAvailable(loader) != null;
Modified: tika/trunk/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java?rev=1704934&r1=1704933&r2=1704934&view=diff
==============================================================================
--- tika/trunk/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java (original)
+++ tika/trunk/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java Wed Sep 23 21:04:08 2015
@@ -17,7 +17,8 @@
package org.apache.tika.example;
-import java.io.File;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
@@ -29,6 +30,7 @@ import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
+
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
@@ -38,6 +40,7 @@ import javax.xml.transform.dom.DOMSource
import javax.xml.transform.stream.StreamResult;
import org.apache.tika.config.TikaConfig;
+import org.apache.tika.detect.CompositeDetector;
import org.apache.tika.detect.DefaultDetector;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
@@ -51,8 +54,6 @@ import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
-import static java.nio.charset.StandardCharsets.UTF_8;
-
/**
* This class shows how to dump a TikaConfig object to a configuration file.
@@ -70,21 +71,21 @@ public class DumpTikaConfigExample {
* @param writer writer to which to write
* @throws Exception
*/
- public void dump(TikaConfig config, Writer writer, String encoding) throws Exception {
+ public void dump(TikaConfig config, Mode mode, Writer writer, String encoding) throws Exception {
DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance();
DocumentBuilder docBuilder = docFactory.newDocumentBuilder();
+
// root elements
Document doc = docBuilder.newDocument();
Element rootElement = doc.createElement("properties");
doc.appendChild(rootElement);
- addMimeComment(rootElement, doc);
- addTranslator(rootElement, doc, config);
- addDetectors(rootElement, doc, config);
- addParsers(rootElement, doc, config);
+ addMimeComment(mode, rootElement, doc);
+ addTranslator(mode, rootElement, doc, config);
+ addDetectors(mode, rootElement, doc, config);
+ addParsers(mode, rootElement, doc, config);
-
- //now write
+ // now write
TransformerFactory transformerFactory = TransformerFactory.newInstance();
Transformer transformer = transformerFactory.newTransformer();
transformer.setOutputProperty(OutputKeys.INDENT, "yes");
@@ -96,33 +97,50 @@ public class DumpTikaConfigExample {
transformer.transform(source, result);
}
- private void addTranslator(Element rootElement, Document doc, TikaConfig config) {
- //TikaConfig only reads the first translator from the list,
- //but it looks like it expects a list
+ private void addTranslator(Mode mode, Element rootElement, Document doc, TikaConfig config) {
+ // TikaConfig only reads the first translator from the list,
+ // but it looks like it expects a list
Translator translator = config.getTranslator();
- if (translator instanceof DefaultTranslator) {
+ if (mode == Mode.MINIMAL && translator instanceof DefaultTranslator) {
Node mimeComment = doc.createComment(
"for example: <translator class=\"org.apache.tika.language.translate.GoogleTranslator\"/>");
rootElement.appendChild(mimeComment);
} else {
- Element translatorElement = doc.createElement("translator");
- translatorElement.setAttribute("class", translator.getClass().getCanonicalName());
- rootElement.appendChild(translatorElement);
+ if (translator instanceof DefaultTranslator && mode == Mode.STATIC) {
+ translator = ((DefaultTranslator)translator).getTranslator();
+ }
+ if (translator != null) {
+ Element translatorElement = doc.createElement("translator");
+ translatorElement.setAttribute("class", translator.getClass().getCanonicalName());
+ rootElement.appendChild(translatorElement);
+ } else {
+ rootElement.appendChild(doc.createComment("No translators available"));
+ }
}
}
- private void addMimeComment(Element rootElement, Document doc) {
+ private void addMimeComment(Mode mode, Element rootElement, Document doc) {
Node mimeComment = doc.createComment(
"for example: <mimeTypeRepository resource=\"/org/apache/tika/mime/tika-mimetypes.xml\"/>");
rootElement.appendChild(mimeComment);
}
- private void addDetectors(Element rootElement, Document doc, TikaConfig config) throws Exception {
+ private void addDetectors(Mode mode, Element rootElement, Document doc, TikaConfig config) throws Exception {
Detector detector = config.getDetector();
+
+ if (mode == Mode.MINIMAL && detector instanceof DefaultDetector) {
+ // Don't output anything, all using defaults
+ return;
+ }
+
Element detectorsElement = doc.createElement("detectors");
-
- if (detector instanceof DefaultDetector) {
- List<Detector> children = ((DefaultDetector) detector).getDetectors();
+ if (mode == Mode.CURRENT && detector instanceof DefaultDetector ||
+ ! (detector instanceof CompositeDetector)) {
+ Element detectorElement = doc.createElement("detector");
+ detectorElement.setAttribute("class", detector.getClass().getCanonicalName());
+ detectorsElement.appendChild(detectorElement);
+ } else {
+ List<Detector> children = ((CompositeDetector)detector).getDetectors();
for (Detector d : children) {
Element detectorElement = doc.createElement("detector");
detectorElement.setAttribute("class", d.getClass().getCanonicalName());
@@ -132,7 +150,7 @@ public class DumpTikaConfigExample {
rootElement.appendChild(detectorsElement);
}
- private void addParsers(Element rootElement, Document doc, TikaConfig config) throws Exception {
+ private void addParsers(Mode mode, Element rootElement, Document doc, TikaConfig config) throws Exception {
Map<String, Parser> parsers = getConcreteParsers(config.getParser());
Element parsersElement = doc.createElement("parsers");
@@ -187,18 +205,42 @@ public class DumpTikaConfigExample {
*/
public static void main(String[] args) throws Exception {
Charset encoding = UTF_8;
+ Mode mode = Mode.CURRENT;
+ String filename = null;
+
+ for (String arg : args) {
+ if (arg.startsWith("-")) {
+ if (arg.contains("-dump-minimal")) {
+ mode = Mode.MINIMAL;
+ } else if (arg.contains("-dump-current")) {
+ mode = Mode.CURRENT;
+ } else if (arg.contains("-dump-static")) {
+ mode = Mode.STATIC;
+ } else {
+ System.out.println("Use:");
+ System.out.println(" DumpTikaConfig [--dump-minimal] [--dump-current] [--dump-static] [filename] [encoding]");
+ System.out.println("");
+ System.out.println("--dump-minimal Produce the minimal config file");
+ System.out.println("--dump-current The current (with defaults) config file");
+ System.out.println("--dump-static Convert dynamic parts to static");
+ return;
+ }
+ } else if (filename == null) {
+ filename = arg;
+ } else {
+ encoding = Charset.forName(arg);
+ }
+ }
+
Writer writer = null;
- if (args.length > 0) {
- writer = new OutputStreamWriter(new FileOutputStream(new File(args[0])), encoding);
+ if (filename != null) {
+ writer = new OutputStreamWriter(new FileOutputStream(filename), encoding);
} else {
writer = new StringWriter();
}
-
- if (args.length > 1) {
- encoding = Charset.forName(args[1]);
- }
+
DumpTikaConfigExample ex = new DumpTikaConfigExample();
- ex.dump(TikaConfig.getDefaultConfig(), writer, encoding.name());
+ ex.dump(TikaConfig.getDefaultConfig(), mode, writer, encoding.name());
writer.flush();
@@ -207,4 +249,7 @@ public class DumpTikaConfigExample {
}
writer.close();
}
+ protected enum Mode { // output mode passed to dump(); main() selects it from --dump-minimal / --dump-current / --dump-static
+ MINIMAL, CURRENT, STATIC;
+ }
}
Modified: tika/trunk/tika-example/src/test/java/org/apache/tika/example/DumpTikaConfigExampleTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-example/src/test/java/org/apache/tika/example/DumpTikaConfigExampleTest.java?rev=1704934&r1=1704933&r2=1704934&view=diff
==============================================================================
--- tika/trunk/tika-example/src/test/java/org/apache/tika/example/DumpTikaConfigExampleTest.java (original)
+++ tika/trunk/tika-example/src/test/java/org/apache/tika/example/DumpTikaConfigExampleTest.java Wed Sep 23 21:04:08 2015
@@ -17,10 +17,9 @@ package org.apache.tika.example;
* limitations under the License.
*/
-
import static java.nio.charset.StandardCharsets.UTF_16LE;
import static java.nio.charset.StandardCharsets.UTF_8;
-import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
import java.io.File;
@@ -32,6 +31,7 @@ import java.nio.charset.Charset;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.CompositeDetector;
+import org.apache.tika.example.DumpTikaConfigExample.Mode;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.Parser;
@@ -39,7 +39,6 @@ import org.junit.After;
import org.junit.Before;
import org.junit.Test;
-
public class DumpTikaConfigExampleTest {
private File configFile;
@Before
@@ -65,22 +64,26 @@ public class DumpTikaConfigExampleTest {
public void testDump() throws Exception {
DumpTikaConfigExample ex = new DumpTikaConfigExample();
for (Charset charset : new Charset[]{UTF_8, UTF_16LE}) {
- Writer writer = new OutputStreamWriter(new FileOutputStream(configFile), charset);
- ex.dump(TikaConfig.getDefaultConfig(), writer, charset.name());
- writer.flush();
- writer.close();
-
- TikaConfig c = new TikaConfig(configFile);
- assertEquals(CompositeParser.class, c.getParser().getClass());
- assertEquals(CompositeDetector.class, c.getDetector().getClass());
-
- CompositeParser p = (CompositeParser) c.getParser();
- assertTrue("enough parsers?", p.getParsers().size() > 130);
-
- CompositeDetector d = (CompositeDetector) c.getDetector();
- assertTrue("enough detectors?", d.getDetectors().size() > 3);
- //just try to load it into autodetect to make sure no errors are thrown
- Parser auto = new AutoDetectParser(c);
+ for (Mode mode : Mode.values()) {
+ Writer writer = new OutputStreamWriter(new FileOutputStream(configFile), charset);
+ ex.dump(TikaConfig.getDefaultConfig(), mode, writer, charset.name());
+ writer.flush();
+ writer.close();
+
+ TikaConfig c = new TikaConfig(configFile);
+ assertTrue(c.getParser().toString(), c.getParser() instanceof CompositeParser);
+ assertTrue(c.getDetector().toString(), c.getDetector() instanceof CompositeDetector);
+
+ CompositeParser p = (CompositeParser) c.getParser();
+ assertTrue("enough parsers?", p.getParsers().size() > 130);
+
+ CompositeDetector d = (CompositeDetector) c.getDetector();
+ assertTrue("enough detectors?", d.getDetectors().size() > 3);
+
+ //just try to load it into autodetect to make sure no errors are thrown
+ Parser auto = new AutoDetectParser(c);
+ assertNotNull(auto);
+ }
}
}