You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2015/08/10 08:24:57 UTC
svn commit: r1694961 -
/tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java
Author: nick
Date: Mon Aug 10 06:24:57 2015
New Revision: 1694961
URL: http://svn.apache.org/r1694961
Log:
Several people on StackOverflow are getting confused by this example, so show how to use AutoDetectParser first and all the components second.
Modified:
tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java
Modified: tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java?rev=1694961&r1=1694960&r2=1694961&view=diff
==============================================================================
--- tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java (original)
+++ tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java Mon Aug 10 06:24:57 2015
@@ -19,11 +19,13 @@ import java.io.File;
import org.apache.commons.io.FileUtils;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.Detector;
+import org.apache.tika.io.TikaInputStream;
import org.apache.tika.language.LanguageIdentifier;
import org.apache.tika.language.LanguageProfile;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MimeTypes;
+import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
@@ -33,14 +35,45 @@ import org.xml.sax.ContentHandler;
* Demonstrates how to call the different components within Tika: its
* {@link Detector} framework (aka MIME identification and repository), its
* {@link Parser} interface, its {@link LanguageIdentifier} and other goodies.
+ * It also shows the "easy way" via {@link AutoDetectParser}
*/
@SuppressWarnings("deprecation")
public class MyFirstTika {
public static void main(String[] args) throws Exception {
String filename = args[0];
- MimeTypes mimeRegistry = TikaConfig.getDefaultConfig()
- .getMimeRepository();
+ TikaConfig tikaConfig = TikaConfig.getDefaultConfig();
+
+ Metadata metadata = new Metadata();
+ String text = parseUsingComponents(filename, tikaConfig, metadata);
+ System.out.println("Parsed Metadata: ");
+ System.out.println(metadata);
+ System.out.println("Parsed Text: ");
+ System.out.println(text);
+
+ System.out.println("-------------------------");
+
+ metadata = new Metadata();
+ text = parseUsingAutoDetect(filename, tikaConfig, metadata);
+ System.out.println("Parsed Metadata: ");
+ System.out.println(metadata);
+ System.out.println("Parsed Text: ");
+ System.out.println(text);
+ }
+
+ public static String parseUsingAutoDetect(String filename, TikaConfig tikaConfig,
+ Metadata metadata) throws Exception {
+ System.out.println("Handling using AutoDetectParser: [" + filename + "]");
+
+ AutoDetectParser parser = new AutoDetectParser(tikaConfig);
+ ContentHandler handler = new BodyContentHandler();
+ TikaInputStream stream = TikaInputStream.get(new File(filename));
+ parser.parse(stream, handler, metadata, new ParseContext());
+ return handler.toString();
+ }
+ public static String parseUsingComponents(String filename, TikaConfig tikaConfig,
+ Metadata metadata) throws Exception {
+ MimeTypes mimeRegistry = tikaConfig.getMimeRepository();
System.out.println("Examining: [" + filename + "]");
@@ -51,8 +84,7 @@ public class MyFirstTika {
+ mimeRegistry.getMimeType(new File(filename)) + "]");
Detector mimeDetector = (Detector) mimeRegistry;
- System.out
- .println("The MIME type (based on the Detector interface) is: ["
+ System.out.println("The MIME type (based on the Detector interface) is: ["
+ mimeDetector.detect(new File(filename).toURI().toURL()
.openStream(), new Metadata()) + "]");
@@ -62,16 +94,12 @@ public class MyFirstTika {
System.out.println("The language of this content is: ["
+ lang.getLanguage() + "]");
- Parser parser = TikaConfig.getDefaultConfig().getParser(
+ Parser parser = tikaConfig.getParser(
MediaType.parse(mimeRegistry.getMimeType(filename).getName()));
- Metadata parsedMet = new Metadata();
ContentHandler handler = new BodyContentHandler();
parser.parse(new File(filename).toURI().toURL().openStream(), handler,
- parsedMet, new ParseContext());
-
- System.out.println("Parsed Metadata: ");
- System.out.println(parsedMet);
- System.out.println("Parsed Text: ");
- System.out.println(handler.toString());
+ metadata, new ParseContext());
+
+ return handler.toString();
}
}