You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2009/09/12 22:42:42 UTC
svn commit: r814234 - /lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java

Author: jukka
Date: Sat Sep 12 20:42:41 2009
New Revision: 814234

URL: http://svn.apache.org/viewvc?rev=814234&view=rev
Log:
TIKA-269: Ease-of-use facade for Tika

Add methods to parse a document to a String.

Move duplicate code to helper methods.

Modified:
    lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java

Modified: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java?rev=814234&r1=814233&r2=814234&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java (original)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java Sat Sep 12 20:42:41 2009
@@ -23,11 +23,17 @@
 import java.io.InputStream;
 import java.io.Reader;
 import java.net.URL;
+import java.util.HashMap;
+import java.util.Map;
 
+import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.ParsingReader;
+import org.apache.tika.sax.BodyContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
 
 /**
  * Facade class for accessing Tika functionality. This class hides much of
@@ -48,28 +54,41 @@
 
     /**
      * Parses the given document and returns the extracted text content.
+     * Input metadata like a file name or a content type hint can be passed
+     * in the given metadata instance. Metadata information extracted from
+     * the document is returned in that same metadata instance.
      *
      * @param stream the document to be parsed
-     * @result extracted text content
+     * @return extracted text content
+     * @throws IOException if the document can not be read or parsed
+     */
+    public static Reader parse(InputStream stream, Metadata metadata)
+            throws IOException {
+        return new ParsingReader(parser, stream, metadata);
+    }
+
+    /**
+     * Parses the given document and returns the extracted text content.
+     *
+     * @param stream the document to be parsed
+     * @return extracted text content
      * @throws IOException if the document can not be read or parsed
      */
     public static Reader parse(InputStream stream) throws IOException {
-        return new ParsingReader(parser, stream, new Metadata());
+        return parse(stream, new Metadata());
     }
 
     /**
      * Parses the given file and returns the extracted text content.
      *
      * @param file the file to be parsed
-     * @result extracted text content
+     * @return extracted text content
      * @throws FileNotFoundException if the given file does not exist
      * @throws IOException if the file can not be read or parsed
      */
     public static Reader parse(File file)
             throws FileNotFoundException, IOException {
-        Metadata metadata = new Metadata();
-        metadata.set(Metadata.RESOURCE_NAME_KEY, file.getName());
-        return new ParsingReader(parser, new FileInputStream(file), metadata);
+        return parse(new FileInputStream(file), getFileMetadata(file));
     }
 
     /**
@@ -77,17 +96,95 @@
      * text content.
      *
      * @param url the URL of the resource to be parsed
-     * @result extracted text content
+     * @return extracted text content
      * @throws IOException if the resource can not be read or parsed
      */
     public static Reader parse(URL url) throws IOException {
+        return parse(url.openStream(), getUrlMetadata(url));
+    }
+
+    /**
+     * Parses the given document and returns the extracted text content.
+     * The given input stream is always closed by this method, even on failure.
+     *
+     * @param stream the document to be parsed
+     * @param metadata input hints for parsing; receives the extracted metadata
+     * @return extracted text content
+     * @throws IOException if the document can not be read
+     * @throws TikaException if the document can not be parsed
+     */
+    public static String parseToString(InputStream stream, Metadata metadata)
+            throws IOException, TikaException {
+        try {
+            ContentHandler handler = new BodyContentHandler();
+            Map<String, Object> context = new HashMap<String, Object>();
+            context.put(Parser.class.getName(), parser);
+            parser.parse(stream, handler, metadata, context);
+            return handler.toString();
+        } catch (SAXException e) {
+            // This should never happen with BodyContentHandler...
+            throw new TikaException("Unexpected SAX processing failure", e);
+        } finally {
+            stream.close();
+        }
+    }
+
+    /**
+     * Parses the given document and returns the extracted text content.
+     * The given input stream is always closed by this method, even on failure.
+     *
+     * @param stream the document to be parsed
+     * @return extracted text content
+     * @throws IOException if the document can not be read
+     * @throws TikaException if the document can not be parsed
+     */
+    public static String parseToString(InputStream stream)
+            throws IOException, TikaException {
+        return parseToString(stream, new Metadata());
+    }
+
+    /**
+     * Parses the given file and returns the extracted text content.
+     *
+     * @param file the file to be parsed
+     * @return extracted text content
+     * @throws FileNotFoundException if the file does not exist
+     * @throws IOException if the file can not be read
+     * @throws TikaException if the file can not be parsed
+     */
+    public static String parseToString(File file)
+            throws FileNotFoundException, IOException, TikaException {
+        return parseToString(new FileInputStream(file), getFileMetadata(file));
+    }
+
+    /**
+     * Parses the resource at the given URL and returns the extracted
+     * text content.
+     *
+     * @param url the URL of the resource to be parsed
+     * @return extracted text content
+     * @throws IOException if the resource can not be read
+     * @throws TikaException if the resource can not be parsed
+     */
+    public static String parseToString(URL url)
+            throws IOException, TikaException {
+        return parseToString(url.openStream(), getUrlMetadata(url));
+    }
+
+    private static Metadata getFileMetadata(File file) {
+        Metadata metadata = new Metadata();
+        metadata.set(Metadata.RESOURCE_NAME_KEY, file.getName());
+        return metadata;
+    }
+
+    private static Metadata getUrlMetadata(URL url) {
         Metadata metadata = new Metadata();
         String path = url.getPath();
         int slash = path.lastIndexOf('/');
         if (slash + 1 < path.length()) {
             metadata.set(Metadata.RESOURCE_NAME_KEY, path.substring(slash + 1));
         }
-        return new ParsingReader(parser, url.openStream(), metadata);
+        return metadata;
     }
 
 }