You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2008/06/06 17:05:18 UTC
svn commit: r663973 - in /incubator/tika/trunk: CHANGES.txt
src/main/java/org/apache/tika/parser/ParsingReader.java
src/test/java/org/apache/tika/parser/ParsingReaderTest.java
Author: jukka
Date: Fri Jun 6 08:05:17 2008
New Revision: 663973
URL: http://svn.apache.org/viewvc?rev=663973&view=rev
Log:
TIKA-143: Add ParsingReader
Added:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParsingReader.java
incubator/tika/trunk/src/test/java/org/apache/tika/parser/ParsingReaderTest.java
Modified:
incubator/tika/trunk/CHANGES.txt
Modified: incubator/tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=663973&r1=663972&r2=663973&view=diff
==============================================================================
--- incubator/tika/trunk/CHANGES.txt (original)
+++ incubator/tika/trunk/CHANGES.txt Fri Jun 6 08:05:17 2008
@@ -48,7 +48,10 @@
20. TIKA-139 - Add a composite parser (Jukka Zitting)
-21. TIKA-142 - Include application/xhtml+xml as valid mime type for XMLParser (mattmann)
+21. TIKA-142 - Include application/xhtml+xml as valid mime type for XMLParser
+ (mattmann)
+
+22. TIKA-143 - Add ParsingReader (Jukka Zitting)
Release 0.1-incubating - 12/27/2007
Added: incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParsingReader.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParsingReader.java?rev=663973&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParsingReader.java (added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParsingReader.java Fri Jun 6 08:05:17 2008
@@ -0,0 +1,219 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.PipedReader;
+import java.io.PipedWriter;
+import java.io.Reader;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.BodyContentHandler;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Reader for the text content from a given binary stream. This class
+ * starts a background thread and uses a {@link Parser}
+ * ({@link AutoDetectParser) by default) to parse the text content from
+ * a given input stream. The {@link BodyContentHandler} class and a pipe
+ * is used to convert the push-based SAX event stream to the pull-based
+ * character stream defined by the {@link Reader} interface.
+ *
+ * @since Apache Tika 0.2
+ */
+public class ParsingReader extends Reader {
+
+ /**
+ * Parser instance used for parsing the given binary stream.
+ */
+ private final Parser parser;
+
+ /**
+ * Read end of the pipe.
+ */
+ private final PipedReader reader;
+
+ /**
+ * Write end of the pipe.
+ */
+ private final PipedWriter writer;
+
+ /**
+ * The binary stream being parsed.
+ */
+ private final InputStream stream;
+
+ /**
+ * Metadata associated with the document being parsed.
+ */
+ private final Metadata metadata;
+
+ /**
+ * An exception (if any) thrown by the parsing thread.
+ */
+ private Throwable throwable;
+
+ /**
+ * Utility method that returns a {@link Metadata} instance
+ * for a document with the given name.
+ *
+ * @param name resource name (or <code>null</code>)
+ * @return metadata instance
+ */
+ private static Metadata getMetadata(String name) {
+ Metadata metadata = new Metadata();
+ if (name != null && name.length() > 0) {
+ metadata.set(Metadata.RESOURCE_NAME_KEY, name);
+ }
+ return metadata;
+ }
+
+ /**
+ * Creates a reader for the text content of the given binary stream.
+ *
+ * @param stream binary stream
+ */
+ public ParsingReader(InputStream stream) {
+ this(new AutoDetectParser(), stream, new Metadata());
+ }
+
+ /**
+ * Creates a reader for the text content of the given binary stream
+ * with the given name.
+ *
+ * @param stream binary stream
+ * @param name document name
+ */
+ public ParsingReader(InputStream stream, String name) {
+ this(new AutoDetectParser(), stream, getMetadata(name));
+ }
+
+ /**
+ * Creates a reader for the text content of the given file.
+ *
+ * @param file file
+ */
+ public ParsingReader(File file) throws FileNotFoundException {
+ this(new FileInputStream(file), file.getName());
+ }
+
+ /**
+ * Creates a reader for the text content of the given binary stream
+ * with the given document metadata. The given parser is used for
+ * parsing.
+ *
+ * @param parser parser instance
+ * @param stream binary stream
+ * @param metadata document metadata
+ */
+ public ParsingReader(Parser parser, InputStream stream, Metadata metadata) {
+ this.parser = parser;
+ this.reader = new PipedReader();
+ try {
+ this.writer = new PipedWriter(reader);
+ } catch (IOException e) {
+ throw new IllegalStateException(e); // Should never happen
+ }
+ this.stream = stream;
+ this.metadata = metadata;
+
+ String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
+ if (name != null) {
+ name = "Apache Tika: " + name;
+ } else {
+ name = "Apache Tika";
+ }
+ new Thread(new ParsingThread(), name).start();
+ }
+
+ /**
+ * The background parsing thread.
+ */
+ private class ParsingThread implements Runnable {
+
+ /**
+ * Parses the given binary stream and writes the text content
+ * to the write end of the pipe. Potential exceptions (including
+ * the one caused if the read end is closed unexpectedly) are
+ * stored before the input stream is closed and processing is stopped.
+ */
+ public void run() {
+ try {
+ ContentHandler handler = new BodyContentHandler(writer);
+ parser.parse(stream, handler, metadata);
+ } catch (Throwable t) {
+ throwable = t;
+ }
+
+ try {
+ stream.close();
+ } catch (Throwable t) {
+ if (throwable == null) {
+ throwable = t;
+ }
+ }
+
+ try {
+ writer.close();
+ } catch (Throwable t) {
+ if (throwable == null) {
+ throwable = t;
+ }
+ }
+ }
+
+ }
+
+ /**
+ * Reads parsed text from the pipe connected to the parsing thread.
+ * Fails if the parsing thread has thrown an exception.
+ *
+ * @param cbuff character buffer
+ * @param off start offset within the buffer
+ * @param len maximum number of characters to read
+ * @throws IOException if the parsing thread has failed or
+ * if for some reason the pipe does not work properly
+ */
+ @Override
+ public int read(char[] cbuf, int off, int len) throws IOException {
+ if (throwable instanceof IOException) {
+ throw (IOException) throwable;
+ } else if (throwable != null) {
+ IOException exception = new IOException("");
+ exception.initCause(throwable);
+ throw exception;
+ }
+ return reader.read(cbuf, off, len);
+ }
+
+ /**
+ * Closes the read end of the pipe. If the parsing thread is still
+ * running, next write to the pipe will fail and cause the thread
+ * to stop. Thus there is no need to explicitly terminate the thread.
+ *
+ * @throws IOException if the pipe can not be closed
+ */
+ @Override
+ public void close() throws IOException {
+ reader.close();
+ }
+
+}
Added: incubator/tika/trunk/src/test/java/org/apache/tika/parser/ParsingReaderTest.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/ParsingReaderTest.java?rev=663973&view=auto
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/parser/ParsingReaderTest.java (added)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/parser/ParsingReaderTest.java Fri Jun 6 08:05:17 2008
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import java.io.ByteArrayInputStream;
+import java.io.InputStream;
+import java.io.Reader;
+
+import junit.framework.TestCase;
+
+public class ParsingReaderTest extends TestCase {
+
+ public void testPlainText() throws Exception {
+ String data = "test content";
+ InputStream stream = new ByteArrayInputStream(data.getBytes("UTF-8"));
+ Reader reader = new ParsingReader(stream, "test.txt");
+ assertEquals('t', reader.read());
+ assertEquals('e', reader.read());
+ assertEquals('s', reader.read());
+ assertEquals('t', reader.read());
+ assertEquals(' ', reader.read());
+ assertEquals('c', reader.read());
+ assertEquals('o', reader.read());
+ assertEquals('n', reader.read());
+ assertEquals('t', reader.read());
+ assertEquals('e', reader.read());
+ assertEquals('n', reader.read());
+ assertEquals('t', reader.read());
+ assertEquals(-1, reader.read());
+ reader.close();
+ assertEquals(-1, stream.read());
+ }
+
+ public void testXML() throws Exception {
+ String data = "<p>test <span>content</span></p>";
+ InputStream stream = new ByteArrayInputStream(data.getBytes("UTF-8"));
+ Reader reader = new ParsingReader(stream, "test.xml");
+ assertEquals('t', reader.read());
+ assertEquals('e', reader.read());
+ assertEquals('s', reader.read());
+ assertEquals('t', reader.read());
+ assertEquals(' ', reader.read());
+ assertEquals('c', reader.read());
+ assertEquals('o', reader.read());
+ assertEquals('n', reader.read());
+ assertEquals('t', reader.read());
+ assertEquals('e', reader.read());
+ assertEquals('n', reader.read());
+ assertEquals('t', reader.read());
+ assertEquals(-1, reader.read());
+ reader.close();
+ assertEquals(-1, stream.read());
+ }
+
+}