You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2009/09/12 22:42:42 UTC
svn commit: r814234 -
/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java
Author: jukka
Date: Sat Sep 12 20:42:41 2009
New Revision: 814234
URL: http://svn.apache.org/viewvc?rev=814234&view=rev
Log:
TIKA-269: Ease-of-use facade for Tika
Add methods to parse a document to a String.
Move duplicate code to helper methods.
Modified:
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java
Modified: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java?rev=814234&r1=814233&r2=814234&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java (original)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java Sat Sep 12 20:42:41 2009
@@ -23,11 +23,17 @@
import java.io.InputStream;
import java.io.Reader;
import java.net.URL;
+import java.util.HashMap;
+import java.util.Map;
+import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParsingReader;
+import org.apache.tika.sax.BodyContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
/**
* Facade class for accessing Tika functionality. This class hides much of
@@ -48,28 +54,41 @@
/**
* Parses the given document and returns the extracted text content.
+ * Input metadata like a file name or a content type hint can be passed
+ * in the given metadata instance. Metadata information extracted from
+ * the document is returned in that same metadata instance.
*
* @param stream the document to be parsed
- * @result extracted text content
+ * @return extracted text content
+ * @throws IOException if the document can not be read or parsed
+ */
+ public static Reader parse(InputStream stream, Metadata metadata)
+ throws IOException {
+ return new ParsingReader(parser, stream, metadata);
+ }
+
+ /**
+ * Parses the given document and returns the extracted text content.
+ *
+ * @param stream the document to be parsed
+ * @return extracted text content
* @throws IOException if the document can not be read or parsed
*/
public static Reader parse(InputStream stream) throws IOException {
- return new ParsingReader(parser, stream, new Metadata());
+ return parse(stream, new Metadata());
}
/**
* Parses the given file and returns the extracted text content.
*
* @param file the file to be parsed
- * @result extracted text content
+ * @return extracted text content
* @throws FileNotFoundException if the given file does not exist
* @throws IOException if the file can not be read or parsed
*/
public static Reader parse(File file)
throws FileNotFoundException, IOException {
- Metadata metadata = new Metadata();
- metadata.set(Metadata.RESOURCE_NAME_KEY, file.getName());
- return new ParsingReader(parser, new FileInputStream(file), metadata);
+ return parse(new FileInputStream(file), getFileMetadata(file));
}
/**
@@ -77,17 +96,95 @@
* text content.
*
* @param url the URL of the resource to be parsed
- * @result extracted text content
+ * @return extracted text content
* @throws IOException if the resource can not be read or parsed
*/
public static Reader parse(URL url) throws IOException {
+ return parse(url.openStream(), getUrlMetadata(url));
+ }
+
+ /**
+ * Parses the given document and returns the extracted text content.
+ * The given input stream is closed by this method.
+ *
+ * @param stream the document to be parsed
+ * @param metadata document metadata
+ * @return extracted text content
+ * @throws IOException if the document can not be read
+ * @throws TikaException if the document can not be parsed
+ */
+ public static String parseToString(InputStream stream, Metadata metadata)
+ throws IOException, TikaException {
+ try {
+ ContentHandler handler = new BodyContentHandler();
+ Map<String, Object> context = new HashMap<String, Object>();
+ context.put(Parser.class.getName(), parser);
+ parser.parse(stream, handler, metadata, context);
+ return handler.toString();
+ } catch (SAXException e) {
+ // This should never happen with BodyContentHandler...
+ throw new TikaException("Unexpected SAX processing failure", e);
+ } finally {
+ stream.close();
+ }
+ }
+
+ /**
+ * Parses the given document and returns the extracted text content.
+ * The given input stream is closed by this method.
+ *
+ * @param stream the document to be parsed
+ * @return extracted text content
+ * @throws IOException if the document can not be read
+ * @throws TikaException if the document can not be parsed
+ */
+ public static String parseToString(InputStream stream)
+ throws IOException, TikaException {
+ return parseToString(stream, new Metadata());
+ }
+
+ /**
+ * Parses the given file and returns the extracted text content.
+ *
+ * @param file the file to be parsed
+ * @return extracted text content
+ * @throws FileNotFoundException if the file does not exist
+ * @throws IOException if the file can not be read
+ * @throws TikaException if the file can not be parsed
+ */
+ public static String parseToString(File file)
+ throws FileNotFoundException, IOException, TikaException {
+ return parseToString(new FileInputStream(file), getFileMetadata(file));
+ }
+
+ /**
+ * Parses the resource at the given URL and returns the extracted
+ * text content.
+ *
+ * @param url the URL of the resource to be parsed
+ * @return extracted text content
+ * @throws IOException if the resource can not be read
+ * @throws TikaException if the resource can not be parsed
+ */
+ public static String parseToString(URL url)
+ throws IOException, TikaException {
+ return parseToString(url.openStream(), getUrlMetadata(url));
+ }
+
+ private static Metadata getFileMetadata(File file) {
+ Metadata metadata = new Metadata();
+ metadata.set(Metadata.RESOURCE_NAME_KEY, file.getName());
+ return metadata;
+ }
+
+ private static Metadata getUrlMetadata(URL url) {
Metadata metadata = new Metadata();
String path = url.getPath();
int slash = path.lastIndexOf('/');
if (slash + 1 < path.length()) {
metadata.set(Metadata.RESOURCE_NAME_KEY, path.substring(slash + 1));
}
- return new ParsingReader(parser, url.openStream(), metadata);
+ return metadata;
}
}