You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2015/08/10 08:24:57 UTC

svn commit: r1694961 - /tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java

Author: nick
Date: Mon Aug 10 06:24:57 2015
New Revision: 1694961

URL: http://svn.apache.org/r1694961
Log:
Several people on StackOverflow are getting confused by this example, so show how to use AutoDetectParser first and all the components second.

Modified:
    tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java

Modified: tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java?rev=1694961&r1=1694960&r2=1694961&view=diff
==============================================================================
--- tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java (original)
+++ tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java Mon Aug 10 06:24:57 2015
@@ -19,11 +19,13 @@ import java.io.File;
 import org.apache.commons.io.FileUtils;
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.detect.Detector;
+import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.language.LanguageIdentifier;
 import org.apache.tika.language.LanguageProfile;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.mime.MimeTypes;
+import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.sax.BodyContentHandler;
@@ -33,14 +35,45 @@ import org.xml.sax.ContentHandler;
  * Demonstrates how to call the different components within Tika: its
  * {@link Detector} framework (aka MIME identification and repository), its
  * {@link Parser} interface, its {@link LanguageIdentifier} and other goodies.
+ * It also shows the "easy way" via {@link AutoDetectParser}
  */
 
 @SuppressWarnings("deprecation")
 public class MyFirstTika {
     public static void main(String[] args) throws Exception {
         String filename = args[0];
-        MimeTypes mimeRegistry = TikaConfig.getDefaultConfig()
-                .getMimeRepository();
+        TikaConfig tikaConfig = TikaConfig.getDefaultConfig();
+        
+        Metadata metadata = new Metadata();
+        String text = parseUsingComponents(filename, tikaConfig, metadata);
+        System.out.println("Parsed Metadata: ");
+        System.out.println(metadata);
+        System.out.println("Parsed Text: ");
+        System.out.println(text);
+        
+        System.out.println("-------------------------");
+        
+        metadata = new Metadata();
+        text = parseUsingAutoDetect(filename, tikaConfig, metadata);
+        System.out.println("Parsed Metadata: ");
+        System.out.println(metadata);
+        System.out.println("Parsed Text: ");
+        System.out.println(text);
+    }
+    
+    public static String parseUsingAutoDetect(String filename, TikaConfig tikaConfig, 
+            Metadata metadata) throws Exception {
+        System.out.println("Handling using AutoDetectParser: [" + filename + "]");
+        
+        AutoDetectParser parser = new AutoDetectParser(tikaConfig);
+        ContentHandler handler = new BodyContentHandler();
+        TikaInputStream stream = TikaInputStream.get(new File(filename));
+        parser.parse(stream, handler, metadata, new ParseContext());
+        return handler.toString();
+    }
+    public static String parseUsingComponents(String filename, TikaConfig tikaConfig, 
+            Metadata metadata) throws Exception {    
+        MimeTypes mimeRegistry = tikaConfig.getMimeRepository();
 
         System.out.println("Examining: [" + filename + "]");
 
@@ -51,8 +84,7 @@ public class MyFirstTika {
                 + mimeRegistry.getMimeType(new File(filename)) + "]");
 
         Detector mimeDetector = (Detector) mimeRegistry;
-        System.out
-        .println("The MIME type (based on the Detector interface) is: ["
+        System.out.println("The MIME type (based on the Detector interface) is: ["
                 + mimeDetector.detect(new File(filename).toURI().toURL()
                         .openStream(), new Metadata()) + "]");
 
@@ -62,16 +94,12 @@ public class MyFirstTika {
         System.out.println("The language of this content is: ["
                 + lang.getLanguage() + "]");
 
-        Parser parser = TikaConfig.getDefaultConfig().getParser(
+        Parser parser = tikaConfig.getParser(
                 MediaType.parse(mimeRegistry.getMimeType(filename).getName()));
-        Metadata parsedMet = new Metadata();
         ContentHandler handler = new BodyContentHandler();
         parser.parse(new File(filename).toURI().toURL().openStream(), handler,
-                parsedMet, new ParseContext());
-
-        System.out.println("Parsed Metadata: ");
-        System.out.println(parsedMet);
-        System.out.println("Parsed Text: ");
-        System.out.println(handler.toString());
+                metadata, new ParseContext());
+        
+        return handler.toString();
     }
 }